[43/51] [partial] nutch git commit: NUTCH-2292 : Mavenize the build for nutch-core and nutch-plugins

thammegowda Sat, 16 Jul 2016 12:48:49 -0700

http://git-wip-us.apache.org/repos/asf/nutch/blob/0bf453e5/nutch-core/src/main/java/org/apache/nutch/net/URLNormalizer.java
----------------------------------------------------------------------
diff --git a/nutch-core/src/main/java/org/apache/nutch/net/URLNormalizer.java 
b/nutch-core/src/main/java/org/apache/nutch/net/URLNormalizer.java
new file mode 100644
index 0000000..78ccb27
--- /dev/null
+++ b/nutch-core/src/main/java/org/apache/nutch/net/URLNormalizer.java
@@ -0,0 +1,37 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.nutch.net;
+
+import java.net.MalformedURLException;
+
+import org.apache.hadoop.conf.Configurable;
+
+/**
+ * Interface used to convert URLs to normal form and optionally perform
+ * substitutions. Implementations are also Hadoop {@link Configurable}s.
+ */
+public interface URLNormalizer extends Configurable {
+
+  /** Extension point ID: the fully qualified class name of this interface. */
+  public static final String X_POINT_ID = URLNormalizer.class.getName();
+
+  /**
+   * Normalizes a URL string.
+   *
+   * @param urlString
+   *          the URL string to normalize
+   * @param scope
+   *          name of the scope (context) the normalization is requested for
+   * @return the normalized URL string. NOTE(review): callers such as
+   *         URLNormalizers.normalize test the value for null between
+   *         normalizers, so implementations presumably may return null -
+   *         confirm.
+   * @throws MalformedURLException
+   *           declared so that implementations can report a URL they cannot
+   *           process
+   */
+  public String normalize(String urlString, String scope)
+      throws MalformedURLException;
+
+}


http://git-wip-us.apache.org/repos/asf/nutch/blob/0bf453e5/nutch-core/src/main/java/org/apache/nutch/net/URLNormalizerChecker.java
----------------------------------------------------------------------
diff --git 
a/nutch-core/src/main/java/org/apache/nutch/net/URLNormalizerChecker.java 
b/nutch-core/src/main/java/org/apache/nutch/net/URLNormalizerChecker.java
new file mode 100644
index 0000000..d8f1c6e
--- /dev/null
+++ b/nutch-core/src/main/java/org/apache/nutch/net/URLNormalizerChecker.java
@@ -0,0 +1,117 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.nutch.net;
+
+import org.apache.nutch.plugin.Extension;
+import org.apache.nutch.plugin.ExtensionPoint;
+import org.apache.nutch.plugin.PluginRepository;
+
+import org.apache.hadoop.conf.Configuration;
+
+import org.apache.nutch.util.NutchConfiguration;
+
+import java.io.BufferedReader;
+import java.io.InputStreamReader;
+
+/**
+ * Command-line tool that checks one given normalizer or all normalizers.
+ *
+ * <p>
+ * URLs are read line by line from standard input until end of input. Each
+ * line is normalized for the requested scope and the result is printed to
+ * standard output.
+ * </p>
+ * <ul>
+ * <li><code>checkOne</code> (option <code>-normalizer</code>): walks all
+ * extensions of the {@link URLNormalizer} extension point, instantiating each
+ * one, and uses the first whose instance class name equals the given name. A
+ * RuntimeException is thrown if the extension point or the normalizer is not
+ * found.</li>
+ * <li><code>checkAll</code> (no <code>-normalizer</code> option): applies the
+ * {@link URLNormalizers} chain built for the scope.</li>
+ * <li><code>main</code>: parses <code>-normalizer</code> and
+ * <code>-scope</code> (default {@link URLNormalizers#SCOPE_DEFAULT}); any
+ * other argument prints the usage text and exits with status -1. On normal
+ * completion the JVM exits with status 0.</li>
+ * </ul>
+ * <p>
+ * NOTE(review): option values are read with <code>args[++i]</code> without a
+ * bounds check, so a trailing <code>-normalizer</code> or <code>-scope</code>
+ * without a value ends in an ArrayIndexOutOfBoundsException instead of the
+ * usage message. Standard input is decoded with the platform default charset
+ * because no charset is passed to the InputStreamReader.
+ * </p>
+ */
+public class URLNormalizerChecker {
+
+  private Configuration conf;
+
+  public URLNormalizerChecker(Configuration conf) {
+    this.conf = conf;
+  }
+
+  private void checkOne(String normalizerName, String scope) throws Exception {
+    URLNormalizer normalizer = null;
+
+    ExtensionPoint point = PluginRepository.get(conf).getExtensionPoint(
+        URLNormalizer.X_POINT_ID);
+
+    if (point == null)
+      throw new RuntimeException(URLNormalizer.X_POINT_ID + " not found.");
+
+    Extension[] extensions = point.getExtensions();
+
+    for (int i = 0; i < extensions.length; i++) {
+      Extension extension = extensions[i];
+      normalizer = (URLNormalizer) extension.getExtensionInstance();
+      if (normalizer.getClass().getName().equals(normalizerName)) {
+        break; // found the requested implementation
+      } else {
+        normalizer = null; // no match: reset so a miss is detected below
+      }
+    }
+
+    if (normalizer == null)
+      throw new RuntimeException("URLNormalizer " + normalizerName
+          + " not found.");
+
+    System.out.println("Checking URLNormalizer " + normalizerName);
+
+    BufferedReader in = new BufferedReader(new InputStreamReader(System.in));
+    String line;
+    while ((line = in.readLine()) != null) { // one URL per input line
+      String out = normalizer.normalize(line, scope);
+      System.out.println(out);
+    }
+  }
+
+  private void checkAll(String scope) throws Exception {
+    System.out.println("Checking combination of all URLNormalizers available");
+
+    BufferedReader in = new BufferedReader(new InputStreamReader(System.in));
+    String line;
+    URLNormalizers normalizers = new URLNormalizers(conf, scope);
+    while ((line = in.readLine()) != null) { // one URL per input line
+      String out = normalizers.normalize(line, scope);
+      System.out.println(out);
+    }
+  }
+
+  public static void main(String[] args) throws Exception {
+
+    String usage = "Usage: URLNormalizerChecker [-normalizer <normalizerName>] 
[-scope <scope>]"
+        + "\n\tscope can be one of: 
default,partition,generate_host_count,fetcher,crawldb,linkdb,inject,outlink";
+
+    String normalizerName = null;
+    String scope = URLNormalizers.SCOPE_DEFAULT;
+    for (int i = 0; i < args.length; i++) {
+      if (args[i].equals("-normalizer")) {
+        normalizerName = args[++i]; // value follows the flag (no bounds check)
+      } else if (args[i].equals("-scope")) {
+        scope = args[++i]; // value follows the flag (no bounds check)
+      } else {
+        System.err.println(usage);
+        System.exit(-1); // unknown argument
+      }
+    }
+
+    URLNormalizerChecker checker = new URLNormalizerChecker(
+        NutchConfiguration.create());
+    if (normalizerName != null) {
+      checker.checkOne(normalizerName, scope);
+    } else {
+      checker.checkAll(scope);
+    }
+
+    System.exit(0);
+  }
+}

http://git-wip-us.apache.org/repos/asf/nutch/blob/0bf453e5/nutch-core/src/main/java/org/apache/nutch/net/URLNormalizers.java
----------------------------------------------------------------------
diff --git a/nutch-core/src/main/java/org/apache/nutch/net/URLNormalizers.java 
b/nutch-core/src/main/java/org/apache/nutch/net/URLNormalizers.java
new file mode 100644
index 0000000..7a34353
--- /dev/null
+++ b/nutch-core/src/main/java/org/apache/nutch/net/URLNormalizers.java
@@ -0,0 +1,325 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.nutch.net;
+
+import java.net.MalformedURLException;
+import java.util.ArrayList;
+import java.util.Arrays;
+import java.util.Collections;
+import java.util.HashMap;
+import java.util.HashSet;
+import java.util.Iterator;
+import java.util.List;
+import java.util.Set;
+import java.util.Vector;
+
+import org.slf4j.Logger;
+import org.slf4j.LoggerFactory;
+import org.apache.hadoop.conf.Configuration;
+import org.apache.nutch.plugin.Extension;
+import org.apache.nutch.plugin.ExtensionPoint;
+import org.apache.nutch.plugin.PluginRepository;
+import org.apache.nutch.plugin.PluginRuntimeException;
+import org.apache.nutch.util.ObjectCache;
+
+/**
+ * This class uses a "chained filter" pattern to run defined normalizers.
+ * Different lists of normalizers may be defined for different "scopes", or
+ * contexts where they are used (note however that they need to be activated
+ * first through <tt>plugin.include</tt> property).
+ * 
+ * <p>
+ * There is one global scope defined by default, which consists of all active
+ * normalizers. The order in which these normalizers are executed may be
+ * defined in "urlnormalizer.order" property, which lists space-separated
+ * implementation classes (if this property is missing normalizers will be run
+ * in random order). If there are more normalizers activated than explicitly
+ * named on this list, the remaining ones will be run in random order after the
+ * ones specified on the list are executed.
+ * </p>
+ * <p>
+ * You can define a set of contexts (or scopes) in which normalizers may be
+ * called. Each scope can have its own list of normalizers (defined in
+ * "urlnormalizer.scope.&lt;scope_name&gt;" property) and its own order
+ * (defined in "urlnormalizer.order.&lt;scope_name&gt;" property). If any of
+ * these properties are missing, default settings are used for the global
+ * scope.
+ * </p>
+ * <p>
+ * In case no normalizers are required for any given scope, a
+ * <code>org.apache.nutch.net.urlnormalizer.pass.PassURLNormalizer</code>
+ * should be used.
+ * </p>
+ * <p>
+ * Each normalizer may further select among many configurations, depending on
+ * the scope in which it is called, because the scope name is passed as a
+ * parameter to each normalizer. You can also use the same normalizer for many
+ * scopes.
+ * </p>
+ * <p>
+ * Several scopes have been defined, and various Nutch tools will attempt using
+ * scope-specific normalizers first (and fall back to default config if
+ * scope-specific configuration is missing).
+ * </p>
+ * <p>
+ * Normalizers may be run several times, to ensure that modifications
+ * introduced by normalizers at the end of the list can be further reduced by
+ * normalizers executed at the beginning. By default this loop is executed just
+ * once - if you want to ensure that all possible combinations have been
+ * applied you may want to run this loop up to the number of activated
+ * normalizers. This loop count can be configured through
+ * <tt>urlnormalizer.loop.count</tt> property. As soon as the url is unchanged
+ * the loop will stop and return the result.
+ * </p>
+ * 
+ * @author Andrzej Bialecki
+ */
+public final class URLNormalizers {
+
+  /**
+   * Default scope. If no scope properties are defined then the configuration
+   * for this scope will be used.
+   */
+  public static final String SCOPE_DEFAULT = "default";
+  /** Scope used by {@link org.apache.nutch.crawl.URLPartitioner}. */
+  public static final String SCOPE_PARTITION = "partition";
+  /** Scope used by {@link org.apache.nutch.crawl.Generator}. */
+  public static final String SCOPE_GENERATE_HOST_COUNT = "generate_host_count";
+  /**
+   * Scope used by {@link org.apache.nutch.fetcher.Fetcher} when processing
+   * redirect URLs.
+   */
+  public static final String SCOPE_FETCHER = "fetcher";
+  /** Scope used when updating the CrawlDb with new URLs. */
+  public static final String SCOPE_CRAWLDB = "crawldb";
+  /** Scope used when updating the LinkDb with new URLs. */
+  public static final String SCOPE_LINKDB = "linkdb";
+  /** Scope used by {@link org.apache.nutch.crawl.Injector}. */
+  public static final String SCOPE_INJECT = "inject";
+  /**
+   * Scope used when constructing new {@link org.apache.nutch.parse.Outlink}
+   * instances.
+   */
+  public static final String SCOPE_OUTLINK = "outlink";
+  /** Scope used when indexing URLs. */
+  public static final String SCOPE_INDEXER = "indexer";
+
+  public static final Logger LOG = LoggerFactory
+      .getLogger(URLNormalizers.class);
+
+  /*
+   * Empty extension list for caching purposes. It is compared by reference
+   * (==) in getURLNormalizers and getExtensions, i.e. it acts as a marker
+   * object rather than as an ordinary empty list.
+   */
+  private final List<Extension> EMPTY_EXTENSION_LIST = Collections
+      .<Extension> emptyList();
+
+  /*
+   * Marker array returned by getURLNormalizers when the extension list is the
+   * EMPTY_EXTENSION_LIST marker; compared by reference in the constructor.
+   * Note that this is an instance field, so each URLNormalizers object has
+   * its own marker array.
+   */
+  private final URLNormalizer[] EMPTY_NORMALIZERS = new URLNormalizer[0];
+
+  private Configuration conf;
+
+  private ExtensionPoint extensionPoint;
+
+  private URLNormalizer[] normalizers; // chain applied by normalize()
+
+  private int loopCount; // value of urlnormalizer.loop.count, default 1
+  /**
+   * Builds the normalizer chain for the given scope.
+   * <p>
+   * Looks up the {@link URLNormalizer} extension point in the plugin
+   * repository, then takes the normalizer array from the {@link ObjectCache}
+   * entry <code>X_POINT_ID + "_" + scope</code> or, when that entry is
+   * absent, from {@link #getURLNormalizers(String)}. If the result is the
+   * <code>EMPTY_NORMALIZERS</code> marker, the same lookup is repeated for
+   * {@link #SCOPE_DEFAULT}. Finally the loop count is read from
+   * <tt>urlnormalizer.loop.count</tt> (default 1).
+   * </p>
+   * <p>
+   * NOTE(review): nothing in this class stores an array under the
+   * <code>X_POINT_ID + "_" + scope</code> key, so that cache lookup
+   * presumably always misses unless another class fills it - confirm. Also,
+   * findExtensions always returns a freshly created list, so the fallback to
+   * the default scope looks unreachable as written - confirm.
+   * </p>
+   * 
+   * @param conf
+   *          configuration used for the plugin repository, the object cache
+   *          and the normalizer properties
+   * @param scope
+   *          name of the scope to build the chain for
+   * @throws RuntimeException
+   *           if the URLNormalizer extension point is not found
+   */
+  public URLNormalizers(Configuration conf, String scope) {
+    this.conf = conf;
+    this.extensionPoint = PluginRepository.get(conf).getExtensionPoint(
+        URLNormalizer.X_POINT_ID);
+    ObjectCache objectCache = ObjectCache.get(conf);
+
+    if (this.extensionPoint == null) {
+      throw new RuntimeException("x point " + URLNormalizer.X_POINT_ID
+          + " not found.");
+    }
+
+    normalizers = (URLNormalizer[]) objectCache
+        .getObject(URLNormalizer.X_POINT_ID + "_" + scope);
+    if (normalizers == null) {
+      normalizers = getURLNormalizers(scope);
+    }
+    if (normalizers == EMPTY_NORMALIZERS) { // reference check on the marker
+      normalizers = (URLNormalizer[]) objectCache
+          .getObject(URLNormalizer.X_POINT_ID + "_" + SCOPE_DEFAULT);
+      if (normalizers == null) {
+        normalizers = getURLNormalizers(SCOPE_DEFAULT);
+      }
+    }
+
+    loopCount = conf.getInt("urlnormalizer.loop.count", 1);
+  }
+
+  /**
+   * Function returns an array of {@link URLNormalizer}s for a given scope,
+   * with a specified order.
+   * <p>
+   * Normalizer instances are looked up in the {@link ObjectCache} under the
+   * extension id and are instantiated and cached on a miss. A
+   * {@link PluginRuntimeException} raised for an extension is not propagated:
+   * its stack trace is printed, a warning is logged and that extension is
+   * left out of the result.
+   * </p>
+   * 
+   * @param scope
+   *          The scope to return the <code>Array</code> of
+   *          {@link URLNormalizer}s for.
+   * @return An <code>Array</code> of {@link URLNormalizer}s for the given
+   *         scope, or the <code>EMPTY_NORMALIZERS</code> marker when the
+   *         extension list for the scope is the
+   *         <code>EMPTY_EXTENSION_LIST</code> marker.
+   */
+  URLNormalizer[] getURLNormalizers(String scope) {
+    List<Extension> extensions = getExtensions(scope);
+    ObjectCache objectCache = ObjectCache.get(conf);
+
+    if (extensions == EMPTY_EXTENSION_LIST) {
+      return EMPTY_NORMALIZERS;
+    }
+
+    List<URLNormalizer> normalizers = new Vector<URLNormalizer>(
+        extensions.size()); // local list; shadows the field of the same name
+
+    Iterator<Extension> it = extensions.iterator();
+    while (it.hasNext()) {
+      Extension ext = it.next();
+      URLNormalizer normalizer = null;
+      try {
+        // check to see if we've cached this URLNormalizer instance yet
+        normalizer = (URLNormalizer) objectCache.getObject(ext.getId());
+        if (normalizer == null) {
+          // go ahead and instantiate it and then cache it
+          normalizer = (URLNormalizer) ext.getExtensionInstance();
+          objectCache.setObject(ext.getId(), normalizer);
+        }
+        normalizers.add(normalizer);
+      } catch (PluginRuntimeException e) {
+        e.printStackTrace(); // NOTE(review): the exception is not passed to LOG
+        LOG.warn("URLNormalizers:PluginRuntimeException when "
+            + "initializing url normalizer plugin "
+            + ext.getDescriptor().getPluginId()
+            + " instance in getURLNormalizers "
+            + "function: attempting to continue instantiating plugins");
+      }
+    }
+    return normalizers.toArray(new URLNormalizer[normalizers.size()]);
+  }
+
+  /**
+   * Returns the extensions to use for a given scope, consulting the
+   * {@link ObjectCache} first (key <code>X_POINT_ID + "_x_" + scope</code>)
+   * and storing the outcome of {@link #findExtensions(String)} on a miss.
+   * 
+   * @param scope
+   *          Scope for which we seek a normalizer plugin.
+   * @return a list of extensions to be used for this scope. When
+   *         findExtensions yields null, the <code>EMPTY_EXTENSION_LIST</code>
+   *         marker is cached and returned. NOTE(review): findExtensions as
+   *         written never returns null, so that branch looks unreachable -
+   *         confirm.
+   */
+  @SuppressWarnings("unchecked")
+  private List<Extension> getExtensions(String scope) {
+    ObjectCache objectCache = ObjectCache.get(conf);
+    List<Extension> extensions = (List<Extension>) objectCache
+        .getObject(URLNormalizer.X_POINT_ID + "_x_" + scope);
+
+    // Just compare the reference:
+    // if this is the empty list, we know we will find no extension.
+    if (extensions == EMPTY_EXTENSION_LIST) {
+      return EMPTY_EXTENSION_LIST;
+    }
+
+    if (extensions == null) {
+      extensions = findExtensions(scope);
+      if (extensions != null) {
+        objectCache.setObject(URLNormalizer.X_POINT_ID + "_x_" + scope,
+            extensions);
+      } else {
+        // Put the empty extension list into cache
+        // to remember we don't know any related extension.
+        objectCache.setObject(URLNormalizer.X_POINT_ID + "_x_" + scope,
+            EMPTY_EXTENSION_LIST);
+        extensions = EMPTY_EXTENSION_LIST;
+      }
+    }
+    return extensions;
+  }
+
+  /**
+   * Searches a list of suitable url normalizer plugins for the given scope.
+   * <p>
+   * The order is read from <code>urlnormalizer.order.&lt;scope&gt;</code>,
+   * falling back to <code>urlnormalizer.order</code>. The optional
+   * whitespace-separated class list in
+   * <code>urlnormalizer.scope.&lt;scope&gt;</code> restricts which extensions
+   * are considered. Extensions are keyed by their class name in a HashMap, so
+   * those not named in the order list follow in the map's iteration order.
+   * </p>
+   * <p>
+   * Unlike the order list, the scope list is split without being trimmed
+   * first, so leading whitespace produces an empty first name; that name
+   * simply matches no extension class.
+   * </p>
+   * 
+   * @param scope
+   *          Scope for which we seek a url normalizer plugin.
+   * @return List - List of extensions to be used for this scope. The list may
+   *         be empty but is never null.
+   */
+  private List<Extension> findExtensions(String scope) {
+
+    String[] orders = null;
+    String orderlist = conf.get("urlnormalizer.order." + scope);
+    if (orderlist == null)
+      orderlist = conf.get("urlnormalizer.order");
+    if (orderlist != null && !orderlist.trim().equals("")) {
+      orders = orderlist.trim().split("\\s+");
+    }
+    String scopelist = conf.get("urlnormalizer.scope." + scope);
+    Set<String> impls = null;
+    if (scopelist != null && !scopelist.trim().equals("")) {
+      String[] names = scopelist.split("\\s+");
+      impls = new HashSet<String>(Arrays.asList(names));
+    }
+    Extension[] extensions = this.extensionPoint.getExtensions();
+    HashMap<String, Extension> normalizerExtensions = new HashMap<String, 
Extension>();
+    for (int i = 0; i < extensions.length; i++) {
+      Extension extension = extensions[i];
+      if (impls != null && !impls.contains(extension.getClazz()))
+        continue; // not listed for this scope
+      normalizerExtensions.put(extension.getClazz(), extension);
+    }
+    List<Extension> res = new ArrayList<Extension>();
+    if (orders == null) {
+      res.addAll(normalizerExtensions.values());
+    } else {
+      // first add those explicitly named in correct order
+      for (int i = 0; i < orders.length; i++) {
+        Extension e = normalizerExtensions.get(orders[i]);
+        if (e != null) {
+          res.add(e);
+          normalizerExtensions.remove(orders[i]);
+        }
+      }
+      // then add all others in random order
+      res.addAll(normalizerExtensions.values());
+    }
+    return res;
+  }
+
+  /**
+   * Runs the URL through every normalizer of the chain, in order, and repeats
+   * the whole chain up to the configured loop count, stopping early as soon
+   * as one full pass leaves the URL unchanged.
+   * <p>
+   * NOTE(review): when the chain is empty and <code>urlString</code> is null,
+   * the comparison after the inner loop dereferences null and throws a
+   * NullPointerException.
+   * </p>
+   * 
+   * @param urlString
+   *          The URL string to normalize.
+   * @param scope
+   *          The given scope; it is passed unchanged to each normalizer.
+   * @return A normalized String, using the given <code>scope</code>, or null
+   *         if the input or the result of a normalizer is null.
+   * @throws MalformedURLException
+   *           If the given URL string is malformed.
+   */
+  public String normalize(String urlString, String scope)
+      throws MalformedURLException {
+    // optionally loop several times, and break if no further changes
+    String initialString = urlString;
+    for (int k = 0; k < loopCount; k++) {
+      for (int i = 0; i < this.normalizers.length; i++) {
+        if (urlString == null)
+          return null; // input or previous normalizer result was null
+        urlString = this.normalizers[i].normalize(urlString, scope);
+      }
+      if (initialString.equals(urlString))
+        break; // this pass changed nothing
+      initialString = urlString;
+    }
+    return urlString;
+  }
+}

http://git-wip-us.apache.org/repos/asf/nutch/blob/0bf453e5/nutch-core/src/main/java/org/apache/nutch/net/package-info.java
----------------------------------------------------------------------
diff --git a/nutch-core/src/main/java/org/apache/nutch/net/package-info.java 
b/nutch-core/src/main/java/org/apache/nutch/net/package-info.java
new file mode 100644
index 0000000..19e0111
--- /dev/null
+++ b/nutch-core/src/main/java/org/apache/nutch/net/package-info.java
@@ -0,0 +1,23 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+/**
+ * Web-related interfaces: URL {@link org.apache.nutch.net.URLFilter filters}
+ * and {@link org.apache.nutch.net.URLNormalizer normalizers}.
+ */
+package org.apache.nutch.net;
+

http://git-wip-us.apache.org/repos/asf/nutch/blob/0bf453e5/nutch-core/src/main/java/org/apache/nutch/net/protocols/HttpDateFormat.java
----------------------------------------------------------------------
diff --git 
a/nutch-core/src/main/java/org/apache/nutch/net/protocols/HttpDateFormat.java 
b/nutch-core/src/main/java/org/apache/nutch/net/protocols/HttpDateFormat.java
new file mode 100644
index 0000000..5f4115b
--- /dev/null
+++ 
b/nutch-core/src/main/java/org/apache/nutch/net/protocols/HttpDateFormat.java
@@ -0,0 +1,124 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.nutch.net.protocols;
+
+import java.util.Calendar;
+import java.util.Date;
+import java.util.Locale;
+import java.util.TimeZone;
+import java.text.SimpleDateFormat;
+import java.text.ParseException;
+
+/**
+ * Utility class for formatting and parsing HTTP dates.
+ * 
+ * <p>
+ * All methods share one {@link SimpleDateFormat} with the pattern
+ * <code>EEE, dd MMM yyyy HH:mm:ss zzz</code>, {@link Locale#US} and the GMT
+ * time zone. <code>SimpleDateFormat</code> is not thread-safe, so every use
+ * synchronizes on the shared instance.
+ * </p>
+ * <p>
+ * NOTE(review): <code>format</code> is a non-final protected static field; if
+ * a subclass reassigned it, callers would lock on different objects -
+ * confirm nothing does so.
+ * </p>
+ * 
+ * Modified from FastHttpDateFormat.java in jakarta-tomcat.
+ * 
+ * @author John Xing
+ */
+public class HttpDateFormat {
+
+  protected static SimpleDateFormat format = new SimpleDateFormat(
+      "EEE, dd MMM yyyy HH:mm:ss zzz", Locale.US); // shared; also the lock
+
+  /**
+   * HTTP date uses TimeZone GMT
+   */
+  static {
+    format.setTimeZone(TimeZone.getTimeZone("GMT"));
+  }
+
+  // HttpDate (long t) {
+  // }
+
+  // HttpDate (String s) {
+  // }
+
+  // /**
+  // * Get the current date in HTTP format.
+  // */
+  // public static String getCurrentDate() {
+  //
+  // long now = System.currentTimeMillis();
+  // if ((now - currentDateGenerated) > 1000) {
+  // synchronized (format) {
+  // if ((now - currentDateGenerated) > 1000) {
+  // currentDateGenerated = now;
+  // currentDate = format.format(new Date(now));
+  // }
+  // }
+  // }
+  // return currentDate;
+  //
+  // }
+
+  /**
+   * Get the HTTP format of the specified date.
+   * 
+   * @param date
+   *          the date to format
+   * @return the date formatted with the shared HTTP date format
+   */
+  public static String toString(Date date) {
+    String string;
+    synchronized (format) {
+      string = format.format(date);
+    }
+    return string;
+  }
+  /**
+   * Get the HTTP format of the time held by the specified calendar.
+   * 
+   * @param cal
+   *          the calendar whose <code>getTime()</code> value is formatted
+   * @return the date formatted with the shared HTTP date format
+   */
+  public static String toString(Calendar cal) {
+    String string;
+    synchronized (format) {
+      string = format.format(cal.getTime());
+    }
+    return string;
+  }
+  /**
+   * Get the HTTP format of the specified time.
+   * 
+   * @param time
+   *          the time value passed to <code>new Date(long)</code>
+   * @return the date formatted with the shared HTTP date format
+   */
+  public static String toString(long time) {
+    String string;
+    synchronized (format) {
+      string = format.format(new Date(time));
+    }
+    return string;
+  }
+  /**
+   * Parses a date string with the shared HTTP date format.
+   * 
+   * @param dateString
+   *          the text to parse
+   * @return the parsed date
+   * @throws ParseException
+   *           if the shared format cannot parse the string
+   */
+  public static Date toDate(String dateString) throws ParseException {
+    Date date;
+    synchronized (format) {
+      date = format.parse(dateString);
+    }
+    return date;
+  }
+  /**
+   * Parses a date string with the shared HTTP date format and returns the
+   * <code>getTime()</code> value of the parsed date.
+   * 
+   * @param dateString
+   *          the text to parse
+   * @return the time value of the parsed date
+   * @throws ParseException
+   *           if the shared format cannot parse the string
+   */
+  public static long toLong(String dateString) throws ParseException {
+    long time;
+    synchronized (format) {
+      time = format.parse(dateString).getTime();
+    }
+    return time;
+  }
+  /**
+   * Round-trip demo: formats the current time, parses the text back and
+   * prints both the first and the re-formatted string.
+   */
+  public static void main(String[] args) throws Exception {
+    Date now = new Date(System.currentTimeMillis());
+
+    String string = HttpDateFormat.toString(now);
+
+    long time = HttpDateFormat.toLong(string);
+
+    System.out.println(string);
+    System.out.println(HttpDateFormat.toString(time));
+  }
+
+}

http://git-wip-us.apache.org/repos/asf/nutch/blob/0bf453e5/nutch-core/src/main/java/org/apache/nutch/net/protocols/ProtocolException.java
----------------------------------------------------------------------
diff --git 
a/nutch-core/src/main/java/org/apache/nutch/net/protocols/ProtocolException.java
 
b/nutch-core/src/main/java/org/apache/nutch/net/protocols/ProtocolException.java
new file mode 100644
index 0000000..0ae3776
--- /dev/null
+++ 
b/nutch-core/src/main/java/org/apache/nutch/net/protocols/ProtocolException.java
@@ -0,0 +1,47 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.nutch.net.protocols;
+
+import java.io.Serializable;
+
+/**
+ * Base exception for all protocol handlers.
+ * <p>
+ * The explicit <code>implements Serializable</code> is redundant because
+ * {@link Exception} is already serializable; no <code>serialVersionUID</code>
+ * is declared and the resulting warning is suppressed.
+ * </p>
+ * 
+ * @deprecated Use {@link org.apache.nutch.protocol.ProtocolException} instead.
+ */
+@Deprecated
+@SuppressWarnings("serial")
+public class ProtocolException extends Exception implements Serializable {
+  /** Creates an exception without detail message or cause. */
+  public ProtocolException() {
+    super();
+  }
+  /** Creates an exception with the given detail message. */
+  public ProtocolException(String message) {
+    super(message);
+  }
+  /** Creates an exception with the given detail message and cause. */
+  public ProtocolException(String message, Throwable cause) {
+    super(message, cause);
+  }
+  /** Creates an exception that wraps the given cause. */
+  public ProtocolException(Throwable cause) {
+    super(cause);
+  }
+
+}

http://git-wip-us.apache.org/repos/asf/nutch/blob/0bf453e5/nutch-core/src/main/java/org/apache/nutch/net/protocols/Response.java
----------------------------------------------------------------------
diff --git 
a/nutch-core/src/main/java/org/apache/nutch/net/protocols/Response.java 
b/nutch-core/src/main/java/org/apache/nutch/net/protocols/Response.java
new file mode 100644
index 0000000..efff14b
--- /dev/null
+++ b/nutch-core/src/main/java/org/apache/nutch/net/protocols/Response.java
@@ -0,0 +1,46 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.nutch.net.protocols;
+
+// JDK imports
+import java.net.URL;
+
+// Nutch imports
+import org.apache.nutch.metadata.HttpHeaders;
+import org.apache.nutch.metadata.Metadata;
+
+/**
+ * A response interface. Makes all protocols model HTTP: every response
+ * exposes a URL, a response code, headers and content, and the interface
+ * inherits the members of {@link HttpHeaders}.
+ */
+public interface Response extends HttpHeaders {
+
+  /** Returns the URL used to retrieve this response. */
+  public URL getUrl();
+
+  /**
+   * Returns the response code (presumably an HTTP-style status code, since
+   * all protocols model HTTP - confirm for non-HTTP protocols).
+   */
+  public int getCode();
+
+  /**
+   * Returns the value of a named header.
+   * 
+   * @param name
+   *          the header name to look up
+   */
+  public String getHeader(String name);
+
+  /** Returns all the headers as a {@link Metadata} object. */
+  public Metadata getHeaders();
+
+  /** Returns the full content of the response as a byte array. */
+  public byte[] getContent();
+
+}

http://git-wip-us.apache.org/repos/asf/nutch/blob/0bf453e5/nutch-core/src/main/java/org/apache/nutch/net/protocols/package-info.java
----------------------------------------------------------------------
diff --git 
a/nutch-core/src/main/java/org/apache/nutch/net/protocols/package-info.java 
b/nutch-core/src/main/java/org/apache/nutch/net/protocols/package-info.java
new file mode 100644
index 0000000..8823f5b
--- /dev/null
+++ b/nutch-core/src/main/java/org/apache/nutch/net/protocols/package-info.java
@@ -0,0 +1,23 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+/**
+ * Helper classes related to the
+ * {@link org.apache.nutch.protocol.Protocol Protocol} interface, see also
+ * {@link org.apache.nutch.protocol}.
+ */
+package org.apache.nutch.net.protocols;
+

http://git-wip-us.apache.org/repos/asf/nutch/blob/0bf453e5/nutch-core/src/main/java/org/apache/nutch/parse/HTMLMetaTags.java
----------------------------------------------------------------------
diff --git a/nutch-core/src/main/java/org/apache/nutch/parse/HTMLMetaTags.java 
b/nutch-core/src/main/java/org/apache/nutch/parse/HTMLMetaTags.java
new file mode 100644
index 0000000..c36c036
--- /dev/null
+++ b/nutch-core/src/main/java/org/apache/nutch/parse/HTMLMetaTags.java
@@ -0,0 +1,203 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.nutch.parse;
+
+import java.net.URL;
+import java.util.Iterator;
+import java.util.Properties;
+
+import org.apache.nutch.metadata.Metadata;
+
+/**
+ * This class holds the information about HTML "meta" tags extracted from a
+ * page. Some special tags have convenience methods for easy checking.
+ */
+public class HTMLMetaTags {
+  private boolean noIndex = false;
+
+  private boolean noFollow = false;
+
+  private boolean noCache = false;
+
+  private URL baseHref = null;
+
+  private boolean refresh = false;
+
+  private int refreshTime = 0;
+
+  private URL refreshHref = null;
+
+  /** General meta tags; never reassigned, only cleared by {@link #reset()}. */
+  private final Metadata generalTags = new Metadata();
+
+  /** "http-equiv" meta tags; never reassigned, only cleared by {@link #reset()}. */
+  private final Properties httpEquivTags = new Properties();
+
+  /**
+   * Sets all boolean values to <code>false</code>, the refresh time to
+   * <code>0</code> and both URLs to <code>null</code>, and clears the general
+   * and "http-equiv" tag collections.
+   */
+  public void reset() {
+    noIndex = false;
+    noFollow = false;
+    noCache = false;
+    refresh = false;
+    refreshTime = 0;
+    baseHref = null;
+    refreshHref = null;
+    generalTags.clear();
+    httpEquivTags.clear();
+  }
+
+  /**
+   * Sets <code>noFollow</code> to <code>true</code>.
+   */
+  public void setNoFollow() {
+    noFollow = true;
+  }
+
+  /**
+   * Sets <code>noIndex</code> to <code>true</code>.
+   */
+  public void setNoIndex() {
+    noIndex = true;
+  }
+
+  /**
+   * Sets <code>noCache</code> to <code>true</code>.
+   */
+  public void setNoCache() {
+    noCache = true;
+  }
+
+  /**
+   * Sets <code>refresh</code> to the supplied value.
+   */
+  public void setRefresh(boolean refresh) {
+    this.refresh = refresh;
+  }
+
+  /**
+   * Sets the <code>baseHref</code>.
+   */
+  public void setBaseHref(URL baseHref) {
+    this.baseHref = baseHref;
+  }
+
+  /**
+   * Sets the <code>refreshHref</code>.
+   */
+  public void setRefreshHref(URL refreshHref) {
+    this.refreshHref = refreshHref;
+  }
+
+  /**
+   * Sets the <code>refreshTime</code>.
+   */
+  public void setRefreshTime(int refreshTime) {
+    this.refreshTime = refreshTime;
+  }
+
+  /**
+   * A convenience method. Returns the current value of <code>noIndex</code>.
+   */
+  public boolean getNoIndex() {
+    return noIndex;
+  }
+
+  /**
+   * A convenience method. Returns the current value of <code>noFollow</code>.
+   */
+  public boolean getNoFollow() {
+    return noFollow;
+  }
+
+  /**
+   * A convenience method. Returns the current value of <code>noCache</code>.
+   */
+  public boolean getNoCache() {
+    return noCache;
+  }
+
+  /**
+   * A convenience method. Returns the current value of <code>refresh</code>.
+   */
+  public boolean getRefresh() {
+    return refresh;
+  }
+
+  /**
+   * A convenience method. Returns the <code>baseHref</code>, if set, or
+   * <code>null</code> otherwise.
+   */
+  public URL getBaseHref() {
+    return baseHref;
+  }
+
+  /**
+   * A convenience method. Returns the <code>refreshHref</code>, if set, or
+   * <code>null</code> otherwise. The value may be invalid if
+   * {@link #getRefresh()} returns <code>false</code>.
+   */
+  public URL getRefreshHref() {
+    return refreshHref;
+  }
+
+  /**
+   * A convenience method. Returns the current value of
+   * <code>refreshTime</code>. The value may be invalid if
+   * {@link #getRefresh()} returns <code>false</code>.
+   */
+  public int getRefreshTime() {
+    return refreshTime;
+  }
+
+  /**
+   * Returns all collected values of the general meta tags. Property names are
+   * tag names, property values are "content" values. The returned object is
+   * the live internal instance, not a copy.
+   */
+  public Metadata getGeneralTags() {
+    return generalTags;
+  }
+
+  /**
+   * Returns all collected values of the "http-equiv" meta tags. Property names
+   * are tag names, property values are "content" values. The returned object
+   * is the live internal instance, not a copy.
+   */
+  public Properties getHttpEquivTags() {
+    return httpEquivTags;
+  }
+
+  /**
+   * Returns a multi-line dump of this object: one line with the base URL, the
+   * boolean flags and the refresh URL, followed by one line per general tag
+   * and one line per "http-equiv" tag.
+   */
+  @Override
+  public String toString() {
+    StringBuilder sb = new StringBuilder();
+    sb.append("base=").append(baseHref).append(", noCache=").append(noCache)
+        .append(", noFollow=").append(noFollow).append(", noIndex=")
+        .append(noIndex).append(", refresh=").append(refresh)
+        .append(", refreshHref=").append(refreshHref).append("\n");
+    sb.append(" * general tags:\n");
+    for (String name : generalTags.names()) {
+      sb.append("   - ").append(name).append("\t=\t")
+          .append(generalTags.get(name)).append("\n");
+    }
+    sb.append(" * http-equiv tags:\n");
+    // Keys are handled as Object: Properties accepts non-String keys through
+    // its Hashtable API, and casting such a key to String would throw.
+    for (Object key : httpEquivTags.keySet()) {
+      sb.append("   - ").append(key).append("\t=\t")
+          .append(httpEquivTags.get(key)).append("\n");
+    }
+    return sb.toString();
+  }
+}

http://git-wip-us.apache.org/repos/asf/nutch/blob/0bf453e5/nutch-core/src/main/java/org/apache/nutch/parse/HtmlParseFilter.java
----------------------------------------------------------------------
diff --git 
a/nutch-core/src/main/java/org/apache/nutch/parse/HtmlParseFilter.java 
b/nutch-core/src/main/java/org/apache/nutch/parse/HtmlParseFilter.java
new file mode 100644
index 0000000..55b51ac
--- /dev/null
+++ b/nutch-core/src/main/java/org/apache/nutch/parse/HtmlParseFilter.java
@@ -0,0 +1,45 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.nutch.parse;
+
+// JDK imports
+import org.w3c.dom.DocumentFragment;
+
+// Hadoop imports
+import org.apache.hadoop.conf.Configurable;
+
+// Nutch imports
+import org.apache.nutch.plugin.Pluggable;
+import org.apache.nutch.protocol.Content;
+
+/**
+ * Extension point for DOM-based HTML parsers. Permits one to add additional
+ * metadata to HTML parses. All plugins found which implement this extension
+ * point are run sequentially on the parse.
+ */
+public interface HtmlParseFilter extends Pluggable, Configurable {
+  /**
+   * The name of the extension point: the fully qualified name of this
+   * interface.
+   */
+  String X_POINT_ID = HtmlParseFilter.class.getName();
+
+  /**
+   * Adds metadata to, or otherwise modifies, a parse of HTML content, given
+   * the DOM tree of the page.
+   *
+   * @param content
+   *          the content the parse was produced from
+   * @param parseResult
+   *          the parse result to work on
+   * @param metaTags
+   *          the HTML "meta" tags collected for the page
+   * @param doc
+   *          the DOM tree of the page
+   * @return the resulting parse result
+   */
+  ParseResult filter(Content content, ParseResult parseResult,
+      HTMLMetaTags metaTags, DocumentFragment doc);
+}

http://git-wip-us.apache.org/repos/asf/nutch/blob/0bf453e5/nutch-core/src/main/java/org/apache/nutch/parse/HtmlParseFilters.java
----------------------------------------------------------------------
diff --git 
a/nutch-core/src/main/java/org/apache/nutch/parse/HtmlParseFilters.java 
b/nutch-core/src/main/java/org/apache/nutch/parse/HtmlParseFilters.java
new file mode 100644
index 0000000..9dd9aad
--- /dev/null
+++ b/nutch-core/src/main/java/org/apache/nutch/parse/HtmlParseFilters.java
@@ -0,0 +1,62 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.nutch.parse;
+
+import org.apache.nutch.protocol.Content;
+import org.apache.nutch.plugin.PluginRepository;
+import org.apache.hadoop.conf.Configuration;
+
+import org.w3c.dom.DocumentFragment;
+
+/** Creates and caches {@link HtmlParseFilter} implementing plugins. */
+public class HtmlParseFilters {
+
+  private HtmlParseFilter[] htmlParseFilters;
+
+  public static final String HTMLPARSEFILTER_ORDER = "htmlparsefilter.order";
+
+  public HtmlParseFilters(Configuration conf) {
+    htmlParseFilters = (HtmlParseFilter[]) PluginRepository.get(conf)
+        .getOrderedPlugins(HtmlParseFilter.class, HtmlParseFilter.X_POINT_ID,
+            HTMLPARSEFILTER_ORDER);
+  }
+
+  /** Run all defined filters. */
+  public ParseResult filter(Content content, ParseResult parseResult,
+      HTMLMetaTags metaTags, DocumentFragment doc) {
+
+    // loop on each filter
+    for (int i = 0; i < this.htmlParseFilters.length; i++) {
+      // call filter interface
+      parseResult = htmlParseFilters[i].filter(content, parseResult, metaTags,
+          doc);
+
+      // any failure on parse obj, return
+      if (!parseResult.isSuccess()) {
+        // TODO: What happens when parseResult.isEmpty() ?
+        // Maybe clone parseResult and use parseResult as backup...
+
+        // remove failed parse before return
+        parseResult.filter();
+        return parseResult;
+      }
+    }
+
+    return parseResult;
+  }
+}

http://git-wip-us.apache.org/repos/asf/nutch/blob/0bf453e5/nutch-core/src/main/java/org/apache/nutch/parse/Outlink.java
----------------------------------------------------------------------
diff --git a/nutch-core/src/main/java/org/apache/nutch/parse/Outlink.java 
b/nutch-core/src/main/java/org/apache/nutch/parse/Outlink.java
new file mode 100644
index 0000000..3ee0354
--- /dev/null
+++ b/nutch-core/src/main/java/org/apache/nutch/parse/Outlink.java
@@ -0,0 +1,135 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.nutch.parse;
+
+import java.io.DataInput;
+import java.io.DataOutput;
+import java.io.IOException;
+import java.net.MalformedURLException;
+import java.util.Map.Entry;
+
+import org.apache.hadoop.io.MapWritable;
+import org.apache.hadoop.io.Text;
+import org.apache.hadoop.io.Writable;
+
+/**
+ * An outgoing link from a page: a target URL, an anchor text and optional
+ * metadata. Equality and hash code are based on the target URL and the anchor
+ * only; the metadata is ignored.
+ */
+public class Outlink implements Writable {
+
+  private String toUrl;
+  private String anchor;
+  private MapWritable md;
+
+  /** Creates an empty instance, to be filled by {@link #readFields(DataInput)}. */
+  public Outlink() {
+  }
+
+  /**
+   * Creates an outlink without metadata.
+   *
+   * @param toUrl
+   *          the target URL
+   * @param anchor
+   *          the anchor text; <code>null</code> is stored as the empty string
+   * @throws MalformedURLException
+   *           declared for callers; not thrown by this implementation
+   */
+  public Outlink(String toUrl, String anchor) throws MalformedURLException {
+    this.toUrl = toUrl;
+    this.anchor = (anchor == null) ? "" : anchor;
+    this.md = null;
+  }
+
+  /**
+   * Reads the URL, the anchor and, if the stored flag says so, the metadata.
+   * Without stored metadata the metadata of this instance becomes
+   * <code>null</code>.
+   */
+  @Override
+  public void readFields(DataInput in) throws IOException {
+    toUrl = Text.readString(in);
+    anchor = Text.readString(in);
+    if (in.readBoolean()) {
+      md = new MapWritable();
+      md.readFields(in);
+    } else {
+      md = null;
+    }
+  }
+
+  /** Skips over one Outlink in the input. */
+  public static void skip(DataInput in) throws IOException {
+    Text.skip(in); // skip toUrl
+    Text.skip(in); // skip anchor
+    if (in.readBoolean()) {
+      // consume the serialized metadata by reading it into a throwaway map
+      new MapWritable().readFields(in);
+    }
+  }
+
+  /**
+   * Writes the URL and the anchor, then a flag telling whether metadata
+   * follows. Metadata is only written when it is non-null and non-empty, so
+   * an empty map is read back as <code>null</code>.
+   */
+  @Override
+  public void write(DataOutput out) throws IOException {
+    Text.writeString(out, toUrl);
+    Text.writeString(out, anchor);
+    if (md != null && md.size() > 0) {
+      out.writeBoolean(true);
+      md.write(out);
+    } else {
+      out.writeBoolean(false);
+    }
+  }
+
+  /** Creates a new Outlink and fills it from the input. */
+  public static Outlink read(DataInput in) throws IOException {
+    Outlink outlink = new Outlink();
+    outlink.readFields(in);
+    return outlink;
+  }
+
+  /** Returns the target URL. */
+  public String getToUrl() {
+    return toUrl;
+  }
+
+  /** Sets the target URL. */
+  public void setUrl(String toUrl) {
+    this.toUrl = toUrl;
+  }
+
+  /** Returns the anchor text. */
+  public String getAnchor() {
+    return anchor;
+  }
+
+  /** Returns the metadata, or <code>null</code> if none is set. */
+  public MapWritable getMetadata() {
+    return md;
+  }
+
+  /** Sets the metadata; <code>null</code> removes it. */
+  public void setMetadata(MapWritable md) {
+    this.md = md;
+  }
+
+  /**
+   * Two outlinks are equal when their target URLs and anchors are equal.
+   * Safe to call on an instance whose fields are still <code>null</code>.
+   */
+  @Override
+  public boolean equals(Object o) {
+    if (this == o) {
+      return true;
+    }
+    if (!(o instanceof Outlink)) {
+      return false;
+    }
+    Outlink other = (Outlink) o;
+    return nullSafeEquals(toUrl, other.toUrl)
+        && nullSafeEquals(anchor, other.anchor);
+  }
+
+  /** Returns "toUrl: ... anchor: ..." followed by every metadata entry. */
+  @Override
+  public String toString() {
+    StringBuilder repr = new StringBuilder("toUrl: ");
+    repr.append(toUrl);
+    repr.append(" anchor: ");
+    repr.append(anchor);
+    if (md != null && !md.isEmpty()) {
+      for (Entry<Writable, Writable> e : md.entrySet()) {
+        repr.append(" ");
+        repr.append(e.getKey());
+        repr.append(": ");
+        repr.append(e.getValue());
+      }
+    }
+    return repr.toString();
+  }
+
+  /**
+   * Consistent with {@link #equals(Object)}: combines the hash codes of the
+   * target URL and the anchor, counting a <code>null</code> field as 0.
+   */
+  @Override
+  public int hashCode() {
+    return nullSafeHash(toUrl) ^ nullSafeHash(anchor);
+  }
+
+  /** Compares two strings, treating two <code>null</code>s as equal. */
+  private static boolean nullSafeEquals(String a, String b) {
+    return (a == null) ? (b == null) : a.equals(b);
+  }
+
+  /** Hash code of a string, 0 for <code>null</code>. */
+  private static int nullSafeHash(String s) {
+    return (s == null) ? 0 : s.hashCode();
+  }
+}

http://git-wip-us.apache.org/repos/asf/nutch/blob/0bf453e5/nutch-core/src/main/java/org/apache/nutch/parse/OutlinkExtractor.java
----------------------------------------------------------------------
diff --git 
a/nutch-core/src/main/java/org/apache/nutch/parse/OutlinkExtractor.java 
b/nutch-core/src/main/java/org/apache/nutch/parse/OutlinkExtractor.java
new file mode 100644
index 0000000..d1773f8
--- /dev/null
+++ b/nutch-core/src/main/java/org/apache/nutch/parse/OutlinkExtractor.java
@@ -0,0 +1,145 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.nutch.parse;
+
+import java.net.MalformedURLException;
+import java.util.ArrayList;
+import java.util.List;
+
+import org.slf4j.Logger;
+import org.slf4j.LoggerFactory;
+
+import org.apache.hadoop.conf.Configuration;
+import org.apache.oro.text.regex.MatchResult;
+import org.apache.oro.text.regex.Pattern;
+import org.apache.oro.text.regex.PatternCompiler;
+import org.apache.oro.text.regex.PatternMatcher;
+import org.apache.oro.text.regex.PatternMatcherInput;
+import org.apache.oro.text.regex.Perl5Compiler;
+import org.apache.oro.text.regex.Perl5Matcher;
+
+/**
+ * Extractor to extract {@link org.apache.nutch.parse.Outlink}s / URLs from
+ * plain text using Regular Expressions.
+ * 
+ * @see <a
+ *      
href="http://wiki.java.net/bin/view/Javapedia/RegularExpressions";>Comparison
+ *      of different regexp-Implementations </a>
+ * @see <a href="http://regex.info/java.html";>Overview about Java Regexp APIs
+ *      </a>
+ * 
+ * @author Stephan Strittmatter - http://www.sybit.de
+ * @version 1.0
+ * @since 0.7
+ */
+public class OutlinkExtractor {
+  private static final Logger LOG = LoggerFactory
+      .getLogger(OutlinkExtractor.class);
+
+  /**
+   * Regex pattern to get URLs within a plain text.
+   * 
+   * @see <a
+   *      
href="http://www.truerwords.net/articles/ut/urlactivation.html";>http://www.truerwords.net/articles/ut/urlactivation.html
+
+   *      </a>
+   */
+  private static final String URL_PATTERN = 
"([A-Za-z][A-Za-z0-9+.-]{1,120}:[A-Za-z0-9/](([A-Za-z0-9$_.+!*,;/?:@&~=-])|%[A-Fa-f0-9]{2}){1,333}(#([a-zA-Z0-9][a-zA-Z0-9$_.+!*,;/?:@&~=%-]{0,1000}))?)";
+
+  /**
+   * Extracts <code>Outlink</code> from given plain text. Applying this method
+   * to non-plain-text can result in extremely lengthy runtimes for parasitic
+   * cases (postscript is a known example).
+   * 
+   * @param plainText
+   *          the plain text from wich URLs should be extracted.
+   * 
+   * @return Array of <code>Outlink</code>s within found in plainText
+   */
+  public static Outlink[] getOutlinks(final String plainText, Configuration 
conf) {
+    return OutlinkExtractor.getOutlinks(plainText, "", conf);
+  }
+
+  /**
+   * Extracts <code>Outlink</code> from given plain text and adds anchor to the
+   * extracted <code>Outlink</code>s
+   * 
+   * @param plainText
+   *          the plain text from wich URLs should be extracted.
+   * @param anchor
+   *          the anchor of the url
+   * 
+   * @return Array of <code>Outlink</code>s within found in plainText
+   */
+  public static Outlink[] getOutlinks(final String plainText, String anchor,
+      Configuration conf) {
+    long start = System.currentTimeMillis();
+    final List<Outlink> outlinks = new ArrayList<Outlink>();
+
+    try {
+      final PatternCompiler cp = new Perl5Compiler();
+      final Pattern pattern = cp.compile(URL_PATTERN,
+          Perl5Compiler.CASE_INSENSITIVE_MASK | Perl5Compiler.READ_ONLY_MASK
+              | Perl5Compiler.MULTILINE_MASK);
+      final PatternMatcher matcher = new Perl5Matcher();
+
+      final PatternMatcherInput input = new PatternMatcherInput(plainText);
+
+      MatchResult result;
+      String url;
+
+      // loop the matches
+      while (matcher.contains(input, pattern)) {
+        // if this is taking too long, stop matching
+        // (SHOULD really check cpu time used so that heavily loaded systems
+        // do not unnecessarily hit this limit.)
+        if (System.currentTimeMillis() - start >= 60000L) {
+          if (LOG.isWarnEnabled()) {
+            LOG.warn("Time limit exceeded for getOutLinks");
+          }
+          break;
+        }
+        result = matcher.getMatch();
+        url = result.group(0);
+        try {
+          outlinks.add(new Outlink(url, anchor));
+        } catch (MalformedURLException mue) {
+          LOG.warn("Invalid url: '" + url + "', skipping.");
+        }
+      }
+    } catch (Exception ex) {
+      // if the matcher fails (perhaps a malformed URL) we just log it and move
+      // on
+      if (LOG.isErrorEnabled()) {
+        LOG.error("getOutlinks", ex);
+      }
+    }
+
+    final Outlink[] retval;
+
+    // create array of the Outlinks
+    if (outlinks != null && outlinks.size() > 0) {
+      retval = outlinks.toArray(new Outlink[0]);
+    } else {
+      retval = new Outlink[0];
+    }
+
+    return retval;
+  }
+
+}

http://git-wip-us.apache.org/repos/asf/nutch/blob/0bf453e5/nutch-core/src/main/java/org/apache/nutch/parse/Parse.java
----------------------------------------------------------------------
diff --git a/nutch-core/src/main/java/org/apache/nutch/parse/Parse.java 
b/nutch-core/src/main/java/org/apache/nutch/parse/Parse.java
new file mode 100644
index 0000000..9a33445
--- /dev/null
+++ b/nutch-core/src/main/java/org/apache/nutch/parse/Parse.java
@@ -0,0 +1,38 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.nutch.parse;
+
+/**
+ * The result of parsing a page's raw content.
+ * 
+ * @see Parser#getParse(org.apache.nutch.protocol.Content)
+ */
+public interface Parse {
+
+  /**
+   * The textual content of the page. This is indexed, searched, and used when
+   * generating snippets.
+   *
+   * @return the text of the page
+   */
+  String getText();
+
+  /**
+   * Other data extracted from the page.
+   *
+   * @return the data extracted besides the text
+   */
+  ParseData getData();
+
+  /**
+   * Indicates if the parse is coming from a url or a sub-url.
+   *
+   * @return the canonical flag of this parse
+   */
+  boolean isCanonical();
+}

http://git-wip-us.apache.org/repos/asf/nutch/blob/0bf453e5/nutch-core/src/main/java/org/apache/nutch/parse/ParseCallable.java
----------------------------------------------------------------------
diff --git a/nutch-core/src/main/java/org/apache/nutch/parse/ParseCallable.java 
b/nutch-core/src/main/java/org/apache/nutch/parse/ParseCallable.java
new file mode 100644
index 0000000..12cae8a
--- /dev/null
+++ b/nutch-core/src/main/java/org/apache/nutch/parse/ParseCallable.java
@@ -0,0 +1,37 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.nutch.parse;
+
+import java.util.concurrent.Callable;
+
+import org.apache.nutch.protocol.Content;
+
+/**
+ * {@link Callable} that runs one {@link Parser} on one {@link Content} and
+ * returns the parser's result.
+ */
+class ParseCallable implements Callable<ParseResult> {
+  private final Parser parser;
+  private final Content content;
+
+  /**
+   * @param p
+   *          the parser to run
+   * @param content
+   *          the content handed to the parser
+   */
+  public ParseCallable(Parser p, Content content) {
+    this.parser = p;
+    this.content = content;
+  }
+
+  /** Delegates to the parser's <code>getParse</code> with the stored content. */
+  @Override
+  public ParseResult call() throws Exception {
+    return parser.getParse(content);
+  }
+}
\ No newline at end of file

http://git-wip-us.apache.org/repos/asf/nutch/blob/0bf453e5/nutch-core/src/main/java/org/apache/nutch/parse/ParseData.java
----------------------------------------------------------------------
diff --git a/nutch-core/src/main/java/org/apache/nutch/parse/ParseData.java 
b/nutch-core/src/main/java/org/apache/nutch/parse/ParseData.java
new file mode 100644
index 0000000..8189269
--- /dev/null
+++ b/nutch-core/src/main/java/org/apache/nutch/parse/ParseData.java
@@ -0,0 +1,255 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.nutch.parse;
+
+import java.io.*;
+import java.util.*;
+
+import org.apache.commons.cli.Options;
+import org.apache.hadoop.io.*;
+import org.apache.hadoop.util.GenericOptionsParser;
+import org.apache.hadoop.conf.Configuration;
+import org.apache.hadoop.fs.*;
+import org.apache.hadoop.fs.FileSystem;
+
+import org.apache.nutch.metadata.Metadata;
+import org.apache.nutch.util.NutchConfiguration;
+
+/**
+ * Data extracted from a page's content.
+ * 
+ * @see Parse#getData()
+ */
+public final class ParseData extends VersionedWritable {
+  public static final String DIR_NAME = "parse_data";
+
+  private final static byte VERSION = 5;
+
+  private String title;
+  private Outlink[] outlinks;
+  private Metadata contentMeta;
+  private Metadata parseMeta;
+  private ParseStatus status;
+  private byte version = VERSION;
+
+  /**
+   * Creates an instance with empty content and parse metadata; status, title
+   * and outlinks stay <code>null</code> until
+   * {@link #readFields(DataInput)} fills them.
+   */
+  public ParseData() {
+    contentMeta = new Metadata();
+    parseMeta = new Metadata();
+  }
+
+  /** Same as the five-argument constructor with empty parse metadata. */
+  public ParseData(ParseStatus status, String title, Outlink[] outlinks,
+      Metadata contentMeta) {
+    this(status, title, outlinks, contentMeta, new Metadata());
+  }
+
+  /** Creates an instance holding the given values; nothing is copied. */
+  public ParseData(ParseStatus status, String title, Outlink[] outlinks,
+      Metadata contentMeta, Metadata parseMeta) {
+    this.status = status;
+    this.title = title;
+    this.outlinks = outlinks;
+    this.contentMeta = contentMeta;
+    this.parseMeta = parseMeta;
+  }
+
+  //
+  // Accessor methods
+  //
+
+  /** The status of parsing the page. */
+  public ParseStatus getStatus() {
+    return status;
+  }
+
+  /** The title of the page. */
+  public String getTitle() {
+    return title;
+  }
+
+  /** The outlinks of the page. */
+  public Outlink[] getOutlinks() {
+    return outlinks;
+  }
+
+  /** The original Metadata retrieved from content */
+  public Metadata getContentMeta() {
+    return contentMeta;
+  }
+
+  /**
+   * Other content properties. This is the place to find format-specific
+   * properties. Different parser implementations for different content types
+   * will populate this differently.
+   */
+  public Metadata getParseMeta() {
+    return parseMeta;
+  }
+
+  /** Replaces the parse metadata. */
+  public void setParseMeta(Metadata parseMeta) {
+    this.parseMeta = parseMeta;
+  }
+
+  /** Replaces the outlinks. */
+  public void setOutlinks(Outlink[] outlinks) {
+    this.outlinks = outlinks;
+  }
+
+  /**
+   * Get a metadata single value. This method first looks for the metadata
+   * value in the parse metadata. If no value is found it then looks for the
+   * metadata in the content metadata.
+   *
+   * @see #getContentMeta()
+   * @see #getParseMeta()
+   */
+  public String getMeta(String name) {
+    String value = parseMeta.get(name);
+    if (value == null) {
+      value = contentMeta.get(name);
+    }
+    return value;
+  }
+
+  //
+  // Writable methods
+  //
+
+  /** Returns the version byte last read, or the current version if none was read. */
+  @Override
+  public byte getVersion() {
+    return version;
+  }
+
+  /**
+   * Reads one record. Only records written with the current version are
+   * accepted.
+   *
+   * @throws VersionMismatchException
+   *           if the stored version byte differs from the current version
+   */
+  @Override
+  public final void readFields(DataInput in) throws IOException {
+
+    version = in.readByte();
+    // incompatible change from UTF8 (version < 5) to Text
+    if (version != VERSION) {
+      throw new VersionMismatchException(VERSION, version);
+    }
+    status = ParseStatus.read(in);
+    title = Text.readString(in); // read title
+
+    int numOutlinks = in.readInt();
+    outlinks = new Outlink[numOutlinks];
+    for (int i = 0; i < numOutlinks; i++) {
+      outlinks[i] = Outlink.read(in);
+    }
+
+    // Only the current version gets past the check above, so the record
+    // always holds the content metadata followed by the parse metadata.
+    contentMeta.clear();
+    contentMeta.readFields(in);
+    parseMeta.clear();
+    parseMeta.readFields(in);
+  }
+
+  /**
+   * Writes version, status, title, the outlinks (count first), the content
+   * metadata and the parse metadata, in that order.
+   */
+  @Override
+  public final void write(DataOutput out) throws IOException {
+    out.writeByte(VERSION); // write version
+    status.write(out); // write status
+    Text.writeString(out, title); // write title
+
+    out.writeInt(outlinks.length); // write outlinks
+    for (int i = 0; i < outlinks.length; i++) {
+      outlinks[i].write(out);
+    }
+    contentMeta.write(out); // write content metadata
+    parseMeta.write(out);
+  }
+
+  /** Creates a new ParseData and fills it from the input. */
+  public static ParseData read(DataInput in) throws IOException {
+    ParseData parseData = new ParseData();
+    parseData.readFields(in);
+    return parseData;
+  }
+
+  //
+  // other methods
+  //
+
+  /**
+   * Two instances are equal when status, title, outlinks, content metadata
+   * and parse metadata are all equal. Fields that are <code>null</code> on
+   * both sides count as equal.
+   */
+  @Override
+  public boolean equals(Object o) {
+    if (this == o) {
+      return true;
+    }
+    if (!(o instanceof ParseData)) {
+      return false;
+    }
+    ParseData other = (ParseData) o;
+    return nullSafeEquals(this.status, other.status)
+        && nullSafeEquals(this.title, other.title)
+        && Arrays.equals(this.outlinks, other.outlinks)
+        && nullSafeEquals(this.contentMeta, other.contentMeta)
+        && nullSafeEquals(this.parseMeta, other.parseMeta);
+  }
+
+  /**
+   * Hash code consistent with {@link #equals(Object)}. It is derived from the
+   * title and the outlinks only, because equal instances are guaranteed to
+   * agree on those and both have hash codes consistent with their equality.
+   */
+  @Override
+  public int hashCode() {
+    int result = (title == null) ? 0 : title.hashCode();
+    return 31 * result + Arrays.hashCode(outlinks);
+  }
+
+  /** Compares two objects, treating two <code>null</code>s as equal. */
+  private static boolean nullSafeEquals(Object a, Object b) {
+    return (a == null) ? (b == null) : a.equals(b);
+  }
+
+  /** Returns a multi-line dump of version, status, title, outlinks and both metadata. */
+  @Override
+  public String toString() {
+    StringBuilder buffer = new StringBuilder();
+
+    buffer.append("Version: ").append(version).append("\n");
+    buffer.append("Status: ").append(status).append("\n");
+    buffer.append("Title: ").append(title).append("\n");
+
+    if (outlinks != null) {
+      buffer.append("Outlinks: ").append(outlinks.length).append("\n");
+      for (int i = 0; i < outlinks.length; i++) {
+        buffer.append("  outlink: ").append(outlinks[i]).append("\n");
+      }
+    }
+
+    buffer.append("Content Metadata: ").append(contentMeta).append("\n");
+    buffer.append("Parse Metadata: ").append(parseMeta).append("\n");
+
+    return buffer.toString();
+  }
+
+  /**
+   * Command-line tool: prints record number <code>recno</code> of the
+   * <code>parse_data</code> directory of a segment.
+   */
+  public static void main(String argv[]) throws Exception {
+    String usage = "ParseData (-local | -dfs <namenode:port>) recno segment";
+
+    if (argv.length < 3) {
+      System.out.println("usage:" + usage);
+      return;
+    }
+
+    Options opts = new Options();
+    Configuration conf = NutchConfiguration.create();
+
+    GenericOptionsParser parser = new GenericOptionsParser(conf, opts, argv);
+
+    String[] remainingArgs = parser.getRemainingArgs();
+    // the option parser may have consumed arguments; make sure recno and
+    // segment are still there before indexing into the array
+    if (remainingArgs.length < 2) {
+      System.out.println("usage:" + usage);
+      return;
+    }
+    FileSystem fs = FileSystem.get(conf);
+
+    try {
+      int recno = Integer.parseInt(remainingArgs[0]);
+      String segment = remainingArgs[1];
+
+      Path file = new Path(segment, DIR_NAME);
+      System.out.println("Reading from file: " + file);
+
+      ArrayFile.Reader parses = new ArrayFile.Reader(fs, file.toString(), conf);
+      try {
+        ParseData parseDatum = new ParseData();
+        parses.get(recno, parseDatum);
+
+        System.out.println("Retrieved " + recno + " from file " + file);
+        System.out.println(parseDatum);
+      } finally {
+        // close the reader even when the lookup fails
+        parses.close();
+      }
+    } finally {
+      fs.close();
+    }
+  }
+
+}

http://git-wip-us.apache.org/repos/asf/nutch/blob/0bf453e5/nutch-core/src/main/java/org/apache/nutch/parse/ParseException.java
----------------------------------------------------------------------
diff --git 
a/nutch-core/src/main/java/org/apache/nutch/parse/ParseException.java 
b/nutch-core/src/main/java/org/apache/nutch/parse/ParseException.java
new file mode 100644
index 0000000..3f27e33
--- /dev/null
+++ b/nutch-core/src/main/java/org/apache/nutch/parse/ParseException.java
@@ -0,0 +1,39 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.nutch.parse;
+
+/**
+ * Checked exception type of the <code>org.apache.nutch.parse</code> package.
+ * It adds no state of its own; every constructor forwards to the matching
+ * {@link Exception} constructor.
+ */
+@SuppressWarnings("serial")
+public class ParseException extends Exception {
+
+  /**
+   * Creates an exception with a detail message and the underlying cause.
+   * 
+   * @param message
+   *          the detail message
+   * @param cause
+   *          the underlying cause
+   */
+  public ParseException(String message, Throwable cause) {
+    super(message, cause);
+  }
+
+  /**
+   * Creates an exception that wraps the given cause.
+   * 
+   * @param cause
+   *          the underlying cause
+   */
+  public ParseException(Throwable cause) {
+    super(cause);
+  }
+
+  /**
+   * Creates an exception with a detail message only.
+   * 
+   * @param message
+   *          the detail message
+   */
+  public ParseException(String message) {
+    super(message);
+  }
+
+  /** Creates an exception with neither a detail message nor a cause. */
+  public ParseException() {
+  }
+
+}

http://git-wip-us.apache.org/repos/asf/nutch/blob/0bf453e5/nutch-core/src/main/java/org/apache/nutch/parse/ParseImpl.java
----------------------------------------------------------------------
diff --git a/nutch-core/src/main/java/org/apache/nutch/parse/ParseImpl.java 
b/nutch-core/src/main/java/org/apache/nutch/parse/ParseImpl.java
new file mode 100644
index 0000000..dc72769
--- /dev/null
+++ b/nutch-core/src/main/java/org/apache/nutch/parse/ParseImpl.java
@@ -0,0 +1,87 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.nutch.parse;
+
+import java.io.*;
+import org.apache.hadoop.io.*;
+
+/**
+ * The result of parsing a page's raw content.
+ * 
+ * @see Parser#getParse(Content)
+ */
+public class ParseImpl implements Parse, Writable {
+  /* extracted text, wrapped so that it can be serialized on its own */
+  private ParseText parseText;
+  /* the ParseData handed back by getData() */
+  private ParseData parseData;
+  /* flag reported by isCanonical(); the shorter constructors set it to true */
+  private boolean canonical;
+
+  /** Creates an empty instance, to be filled by {@link #readFields}. */
+  public ParseImpl() {
+  }
+
+  /**
+   * Copies the text and data of another parse. The copy is always marked
+   * canonical.
+   */
+  public ParseImpl(Parse parse) {
+    this(new ParseText(parse.getText()), parse.getData(), true);
+  }
+
+  /** Creates a canonical parse from plain text and its parse data. */
+  public ParseImpl(String text, ParseData data) {
+    this(new ParseText(text), data, true);
+  }
+
+  /** Creates a canonical parse from already wrapped text and parse data. */
+  public ParseImpl(ParseText text, ParseData data) {
+    this(text, data, true);
+  }
+
+  /** Creates a parse with an explicit canonical flag. */
+  public ParseImpl(ParseText text, ParseData data, boolean isCanonical) {
+    parseText = text;
+    parseData = data;
+    canonical = isCanonical;
+  }
+
+  /** Returns the plain text held by the wrapped {@link ParseText}. */
+  @Override
+  public String getText() {
+    return parseText.getText();
+  }
+
+  /** Returns the parse data given at construction or read from the stream. */
+  @Override
+  public ParseData getData() {
+    return parseData;
+  }
+
+  /** Returns the canonical flag. */
+  @Override
+  public boolean isCanonical() {
+    return canonical;
+  }
+
+  /** Serializes the canonical flag, then the text, then the data. */
+  @Override
+  public final void write(DataOutput out) throws IOException {
+    out.writeBoolean(canonical);
+    parseText.write(out);
+    parseData.write(out);
+  }
+
+  /** Restores the fields in the order in which {@link #write} emits them. */
+  @Override
+  public void readFields(DataInput in) throws IOException {
+    canonical = in.readBoolean();
+
+    parseText = new ParseText();
+    parseText.readFields(in);
+
+    parseData = new ParseData();
+    parseData.readFields(in);
+  }
+
+  /** Reads a new instance from the given input. */
+  public static ParseImpl read(DataInput in) throws IOException {
+    ParseImpl result = new ParseImpl();
+    result.readFields(in);
+    return result;
+  }
+
+}

http://git-wip-us.apache.org/repos/asf/nutch/blob/0bf453e5/nutch-core/src/main/java/org/apache/nutch/parse/ParseOutputFormat.java
----------------------------------------------------------------------
diff --git 
a/nutch-core/src/main/java/org/apache/nutch/parse/ParseOutputFormat.java 
b/nutch-core/src/main/java/org/apache/nutch/parse/ParseOutputFormat.java
new file mode 100644
index 0000000..51b32fc
--- /dev/null
+++ b/nutch-core/src/main/java/org/apache/nutch/parse/ParseOutputFormat.java
@@ -0,0 +1,398 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.nutch.parse;
+
+// SLF4J logging imports
+import org.slf4j.Logger;
+import org.slf4j.LoggerFactory;
+import org.apache.hadoop.io.*;
+import org.apache.hadoop.io.MapFile.Writer.Option;
+import org.apache.hadoop.io.SequenceFile.CompressionType;
+import org.apache.hadoop.io.SequenceFile.Metadata;
+import org.apache.hadoop.io.compress.DefaultCodec;
+import org.apache.hadoop.fs.*;
+import org.apache.hadoop.mapred.*;
+import org.apache.nutch.crawl.CrawlDatum;
+import org.apache.nutch.fetcher.Fetcher;
+import org.apache.nutch.scoring.ScoringFilterException;
+import org.apache.nutch.scoring.ScoringFilters;
+import org.apache.nutch.util.StringUtil;
+import org.apache.nutch.util.URLUtil;
+import org.apache.nutch.metadata.Nutch;
+import org.apache.nutch.net.*;
+
+import java.io.*;
+import java.net.MalformedURLException;
+import java.net.URL;
+import java.util.ArrayList;
+import java.util.List;
+import java.util.Map.Entry;
+
+import org.apache.hadoop.util.Progressable;
+
+/* Parse content in a segment. */
+public class ParseOutputFormat implements OutputFormat<Text, Parse> {
+  private static final Logger LOG = LoggerFactory
+      .getLogger(ParseOutputFormat.class);
+  private URLFilters filters;
+  private URLExemptionFilters exemptionFilters;
+  private URLNormalizers normalizers;
+  private ScoringFilters scfilters;
+
+  private static class SimpleEntry implements Entry<Text, CrawlDatum> {
+    private Text key;
+    private CrawlDatum value;
+
+    public SimpleEntry(Text key, CrawlDatum value) {
+      this.key = key;
+      this.value = value;
+    }
+
+    public Text getKey() {
+      return key;
+    }
+
+    public CrawlDatum getValue() {
+      return value;
+    }
+
+    public CrawlDatum setValue(CrawlDatum value) {
+      this.value = value;
+      return this.value;
+    }
+  }
+
+  public void checkOutputSpecs(FileSystem fs, JobConf job) throws IOException {
+    Path out = FileOutputFormat.getOutputPath(job);
+    if ((out == null) && (job.getNumReduceTasks() != 0)) {
+      throw new InvalidJobConfException("Output directory not set in 
JobConf.");
+    }
+    if (fs == null) {
+      fs = out.getFileSystem(job);
+    }
+    if (fs.exists(new Path(out, CrawlDatum.PARSE_DIR_NAME)))
+      throw new IOException("Segment already parsed!");
+  }
+
+  public RecordWriter<Text, Parse> getRecordWriter(FileSystem fs, JobConf job,
+      String name, Progressable progress) throws IOException {
+
+    if (job.getBoolean("parse.filter.urls", true)) {
+      filters = new URLFilters(job);
+      exemptionFilters = new URLExemptionFilters(job);
+    }
+
+    if (job.getBoolean("parse.normalize.urls", true)) {
+      normalizers = new URLNormalizers(job, URLNormalizers.SCOPE_OUTLINK);
+    }
+
+    this.scfilters = new ScoringFilters(job);
+    final int interval = job.getInt("db.fetch.interval.default", 2592000);
+    final boolean ignoreInternalLinks = job.getBoolean(
+        "db.ignore.internal.links", false);
+    final boolean ignoreExternalLinks = job.getBoolean(
+        "db.ignore.external.links", false);
+    final String ignoreExternalLinksMode = job.get(
+        "db.ignore.external.links.mode", "byHost");
+    
+    int maxOutlinksPerPage = job.getInt("db.max.outlinks.per.page", 100);
+    final boolean isParsing = job.getBoolean("fetcher.parse", true);
+    final int maxOutlinks = (maxOutlinksPerPage < 0) ? Integer.MAX_VALUE
+        : maxOutlinksPerPage;
+    final CompressionType compType = SequenceFileOutputFormat
+        .getOutputCompressionType(job);
+    Path out = FileOutputFormat.getOutputPath(job);
+
+    Path text = new Path(new Path(out, ParseText.DIR_NAME), name);
+    Path data = new Path(new Path(out, ParseData.DIR_NAME), name);
+    Path crawl = new Path(new Path(out, CrawlDatum.PARSE_DIR_NAME), name);
+
+    final String[] parseMDtoCrawlDB = job.get("db.parsemeta.to.crawldb", "")
+        .split(" *, *");
+
+    // textOut Options
+    Option tKeyClassOpt = (Option) MapFile.Writer.keyClass(Text.class);
+    org.apache.hadoop.io.SequenceFile.Writer.Option tValClassOpt = 
SequenceFile.Writer.valueClass(ParseText.class);
+    org.apache.hadoop.io.SequenceFile.Writer.Option tProgressOpt = 
SequenceFile.Writer.progressable(progress);
+    org.apache.hadoop.io.SequenceFile.Writer.Option tCompOpt = 
SequenceFile.Writer.compression(CompressionType.RECORD);
+    
+    final MapFile.Writer textOut = new MapFile.Writer(job, text,
+        tKeyClassOpt, tValClassOpt, tCompOpt, tProgressOpt);
+    
+    // dataOut Options
+    Option dKeyClassOpt = (Option) MapFile.Writer.keyClass(Text.class);
+    org.apache.hadoop.io.SequenceFile.Writer.Option dValClassOpt = 
SequenceFile.Writer.valueClass(ParseData.class);
+    org.apache.hadoop.io.SequenceFile.Writer.Option dProgressOpt = 
SequenceFile.Writer.progressable(progress);
+    org.apache.hadoop.io.SequenceFile.Writer.Option dCompOpt = 
SequenceFile.Writer.compression(compType);
+
+    final MapFile.Writer dataOut = new MapFile.Writer(job, data,
+        dKeyClassOpt, dValClassOpt, dCompOpt, dProgressOpt);
+    
+    final SequenceFile.Writer crawlOut = SequenceFile.createWriter(job, 
SequenceFile.Writer.file(crawl),
+        SequenceFile.Writer.keyClass(Text.class),
+        SequenceFile.Writer.valueClass(CrawlDatum.class),
+        
SequenceFile.Writer.bufferSize(fs.getConf().getInt("io.file.buffer.size",4096)),
+        SequenceFile.Writer.replication(fs.getDefaultReplication(crawl)),
+        SequenceFile.Writer.blockSize(1073741824),
+        SequenceFile.Writer.compression(compType, new DefaultCodec()),
+        SequenceFile.Writer.progressable(progress),
+        SequenceFile.Writer.metadata(new Metadata())); 
+
+    return new RecordWriter<Text, Parse>() {
+
+      /**
+       * Writes everything derived from one parsed page: the text goes to the
+       * text output, the parse data (with the outlinks that were kept) to the
+       * data output, and signature, selected parse metadata, redirect target,
+       * outlink targets and score adjustment go to the crawl output as
+       * {@link CrawlDatum} records.
+       * 
+       * @param key
+       *          URL of the parsed page
+       * @param parse
+       *          parse result for that URL
+       * @throws IOException
+       *           if the source URL is malformed while link ignoring is
+       *           enabled, or if one of the writers fails
+       */
+      public void write(Text key, Parse parse) throws IOException {
+
+        String fromUrl = key.toString();
+        // host or domain name of the source URL
+        String origin = null;
+        textOut.append(key, new ParseText(parse.getText()));
+
+        ParseData parseData = parse.getData();
+        // recover the signature prepared by Fetcher or ParseSegment
+        String sig = parseData.getContentMeta().get(Nutch.SIGNATURE_KEY);
+        if (sig != null) {
+          byte[] signature = StringUtil.fromHexString(sig);
+          if (signature != null) {
+            // append a CrawlDatum with a signature
+            CrawlDatum d = new CrawlDatum(CrawlDatum.STATUS_SIGNATURE, 0);
+            d.setSignature(signature);
+            crawlOut.append(key, d);
+          }
+        }
+
+        // see if the parse metadata contain things that we'd like
+        // to pass to the metadata of the crawlDB entry
+        CrawlDatum parseMDCrawlDatum = null;
+        for (String mdname : parseMDtoCrawlDB) {
+          String mdvalue = parseData.getParseMeta().get(mdname);
+          if (mdvalue != null) {
+            if (parseMDCrawlDatum == null) {
+              parseMDCrawlDatum = new CrawlDatum(CrawlDatum.STATUS_PARSE_META,
+                  0);
+            }
+            parseMDCrawlDatum.getMetaData().put(new Text(mdname),
+                new Text(mdvalue));
+          }
+        }
+        if (parseMDCrawlDatum != null) {
+          crawlOut.append(key, parseMDCrawlDatum);
+        }
+
+        // need to determine origin (once for all outlinks)
+        if (ignoreExternalLinks || ignoreInternalLinks) {
+          URL originURL = new URL(fromUrl);
+          // based on domain?
+          if ("bydomain".equalsIgnoreCase(ignoreExternalLinksMode)) {
+            origin = URLUtil.getDomainName(originURL).toLowerCase();
+          }
+          // use host
+          else {
+            origin = originURL.getHost().toLowerCase();
+          }
+        }
+
+        ParseStatus pstatus = parseData.getStatus();
+        if (pstatus != null && pstatus.isSuccess()
+            && pstatus.getMinorCode() == ParseStatus.SUCCESS_REDIRECT) {
+          String newUrl = pstatus.getMessage();
+          int refreshTime = Integer.parseInt(pstatus.getArgs()[1]);
+          newUrl = filterNormalize(fromUrl, newUrl, origin,
+              ignoreInternalLinks, ignoreExternalLinks,
+              ignoreExternalLinksMode, filters, exemptionFilters, normalizers,
+              URLNormalizers.SCOPE_FETCHER);
+
+          if (newUrl != null) {
+            String reprUrl = URLUtil.chooseRepr(fromUrl, newUrl,
+                refreshTime < Fetcher.PERM_REFRESH_TIME);
+            CrawlDatum newDatum = new CrawlDatum();
+            newDatum.setStatus(CrawlDatum.STATUS_LINKED);
+            if (reprUrl != null && !reprUrl.equals(newUrl)) {
+              newDatum.getMetaData().put(Nutch.WRITABLE_REPR_URL_KEY,
+                  new Text(reprUrl));
+            }
+            crawlOut.append(new Text(newUrl), newDatum);
+          }
+        }
+
+        // collect outlinks for subsequent db update
+        Outlink[] links = parseData.getOutlinks();
+        if (links == null) {
+          // ParseData.toString() allows for a missing outlink array, so treat
+          // it as "no outlinks" here instead of failing on links.length
+          links = new Outlink[0];
+        }
+        int outlinksToStore = Math.min(maxOutlinks, links.length);
+
+        int validCount = 0;
+        CrawlDatum adjust = null;
+        List<Entry<Text, CrawlDatum>> targets =
+            new ArrayList<Entry<Text, CrawlDatum>>(outlinksToStore);
+        List<Outlink> outlinkList = new ArrayList<Outlink>(outlinksToStore);
+        for (int i = 0; i < links.length && validCount < outlinksToStore;
+            i++) {
+          String toUrl = links[i].getToUrl();
+
+          // Only normalize and filter if fetcher.parse = false
+          if (!isParsing) {
+            toUrl = ParseOutputFormat.filterNormalize(fromUrl, toUrl, origin,
+                ignoreInternalLinks, ignoreExternalLinks,
+                ignoreExternalLinksMode, filters, exemptionFilters,
+                normalizers);
+            if (toUrl == null) {
+              continue;
+            }
+          }
+
+          CrawlDatum target = new CrawlDatum(CrawlDatum.STATUS_LINKED,
+              interval);
+          Text targetUrl = new Text(toUrl);
+
+          // see if the outlink has any metadata attached
+          // and if so pass that to the crawldatum so that
+          // the initial score or distribution can use that
+          MapWritable outlinkMD = links[i].getMetadata();
+          if (outlinkMD != null) {
+            target.getMetaData().putAll(outlinkMD);
+          }
+
+          try {
+            scfilters.initialScore(targetUrl, target);
+          } catch (ScoringFilterException e) {
+            LOG.warn("Cannot filter init score for url " + key
+                + ", using default: " + e.getMessage());
+            target.setScore(0.0f);
+          }
+
+          targets.add(new SimpleEntry(targetUrl, target));
+
+          // Overwrite URL in Outlink object with normalized URL (NUTCH-1174)
+          links[i].setUrl(toUrl);
+          outlinkList.add(links[i]);
+          validCount++;
+        }
+
+        try {
+          // compute score contributions and adjustment to the original score
+          adjust = scfilters.distributeScoreToOutlinks(key, parseData, targets,
+              null, links.length);
+        } catch (ScoringFilterException e) {
+          LOG.warn("Cannot distribute score from " + key + ": "
+              + e.getMessage());
+        }
+        for (Entry<Text, CrawlDatum> target : targets) {
+          crawlOut.append(target.getKey(), target.getValue());
+        }
+        if (adjust != null) {
+          crawlOut.append(key, adjust);
+        }
+
+        // store the parse data with only the outlinks that were kept
+        Outlink[] filteredLinks = outlinkList.toArray(new Outlink[outlinkList
+            .size()]);
+        parseData = new ParseData(parseData.getStatus(), parseData.getTitle(),
+            filteredLinks, parseData.getContentMeta(),
+            parseData.getParseMeta());
+        dataOut.append(key, parseData);
+        if (!parse.isCanonical()) {
+          CrawlDatum datum = new CrawlDatum();
+          datum.setStatus(CrawlDatum.STATUS_FETCH_SUCCESS);
+          String timeString = parse.getData().getContentMeta()
+              .get(Nutch.FETCH_TIME_KEY);
+          try {
+            datum.setFetchTime(Long.parseLong(timeString));
+          } catch (NumberFormatException e) {
+            // also raised by Long.parseLong when the fetch time is absent
+            LOG.warn("Can't read fetch time for: " + key);
+            datum.setFetchTime(System.currentTimeMillis());
+          }
+          crawlOut.append(key, datum);
+        }
+      }
+
+      /**
+       * Closes the text, data and crawl writers. Each writer is closed even
+       * when closing an earlier one fails; if several closes fail, the
+       * exception of the last failing writer is the one that propagates.
+       * 
+       * @param reporter
+       *          not used
+       * @throws IOException
+       *           if any of the writers cannot be closed
+       */
+      public void close(Reporter reporter) throws IOException {
+        try {
+          textOut.close();
+        } finally {
+          try {
+            dataOut.close();
+          } finally {
+            crawlOut.close();
+          }
+        }
+      }
+
+    };
+
+  }
+
+  /**
+   * Convenience overload that delegates to the ten-argument
+   * <code>filterNormalize</code>, passing
+   * {@link URLNormalizers#SCOPE_OUTLINK} as the normalizer scope.
+   * 
+   * @return the value returned by the ten-argument overload
+   */
+  public static String filterNormalize(String fromUrl, String toUrl,
+      String fromHost, boolean ignoreInternalLinks,
+      boolean ignoreExternalLinks, String ignoreExternalLinksMode,
+      URLFilters filters, URLExemptionFilters exemptionFilters,
+      URLNormalizers normalizers) {
+    final String outlinkScope = URLNormalizers.SCOPE_OUTLINK;
+    return filterNormalize(fromUrl, toUrl, fromHost, ignoreInternalLinks,
+        ignoreExternalLinks, ignoreExternalLinksMode, filters,
+        exemptionFilters, normalizers, outlinkScope);
+  }
+
+  /**
+   * Decides whether a link from <code>fromUrl</code> to <code>toUrl</code> is
+   * kept and, if so, returns the normalized and filtered target URL.
+   * 
+   * @param fromUrl
+   *          URL of the page containing the link
+   * @param toUrl
+   *          link target
+   * @param origin
+   *          lower-cased host or domain name of the source URL that the
+   *          target is compared with
+   * @param ignoreInternalLinks
+   *          drop targets whose host/domain equals <code>origin</code>
+   * @param ignoreExternalLinks
+   *          drop targets whose host/domain differs from <code>origin</code>
+   * @param ignoreExternalLinksMode
+   *          "bydomain" (case-insensitive) compares domain names, any other
+   *          value compares host names
+   * @param filters
+   *          URL filters to apply, may be null
+   * @param exemptionFilters
+   *          may be null; consulted only for external links in host mode
+   * @param normalizers
+   *          URL normalizers to apply, may be null
+   * @param urlNormalizerScope
+   *          scope handed to the normalizers
+   * @return the resulting URL, or null if the link is to be skipped
+   */
+  public static String filterNormalize(String fromUrl, String toUrl,
+      String origin, boolean ignoreInternalLinks, boolean ignoreExternalLinks,
+      String ignoreExternalLinksMode, URLFilters filters,
+      URLExemptionFilters exemptionFilters, URLNormalizers normalizers,
+      String urlNormalizerScope) {
+    // ignore links to self (or anchors within the page)
+    if (fromUrl.equals(toUrl)) {
+      return null;
+    }
+    if (ignoreExternalLinks || ignoreInternalLinks) {
+      URL targetURL;
+      try {
+        targetURL = new URL(toUrl);
+      } catch (MalformedURLException e1) {
+        return null; // skip it
+      }
+
+      boolean byDomain = "bydomain".equalsIgnoreCase(ignoreExternalLinksMode);
+      // Resolve the target's domain or host once and check for null BEFORE
+      // lower-casing, so that the null checks can actually take effect.
+      String target = byDomain ? URLUtil.getDomainName(targetURL)
+          : targetURL.getHost();
+      if (target != null) {
+        target = target.toLowerCase();
+      }
+      boolean sameOrigin = target != null && target.equals(origin);
+
+      if (ignoreExternalLinks && !sameOrigin) {
+        // an external link survives only in host mode and only when an
+        // exemption filter explicitly lets it through
+        if (byDomain || exemptionFilters == null
+            || !exemptionFilters.isExempted(fromUrl, toUrl)) {
+          return null; // skip it, this external url is not exempted
+        }
+      }
+      if (ignoreInternalLinks && (target == null || sameOrigin)) {
+        return null; // skip it
+      }
+    }
+
+    try {
+      if (normalizers != null) {
+        // normalize the url
+        toUrl = normalizers.normalize(toUrl, urlNormalizerScope);
+      }
+      if (toUrl != null && filters != null) {
+        toUrl = filters.filter(toUrl); // filter the url
+      }
+    } catch (Exception e) {
+      // best effort: a link that cannot be normalized or filtered is dropped
+      return null;
+    }
+
+    return toUrl;
+  }
+
+}

http://git-wip-us.apache.org/repos/asf/nutch/blob/0bf453e5/nutch-core/src/main/java/org/apache/nutch/parse/ParsePluginList.java
----------------------------------------------------------------------
diff --git 
a/nutch-core/src/main/java/org/apache/nutch/parse/ParsePluginList.java 
b/nutch-core/src/main/java/org/apache/nutch/parse/ParsePluginList.java
new file mode 100644
index 0000000..6ad0ac8
--- /dev/null
+++ b/nutch-core/src/main/java/org/apache/nutch/parse/ParsePluginList.java
@@ -0,0 +1,71 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.nutch.parse;
+
+// JDK imports
+import java.util.Arrays;
+import java.util.HashMap;
+import java.util.List;
+import java.util.Map;
+
+/**
+ * This class represents a natural ordering for which parsing plugin should get
+ * called for a particular mimeType. It provides methods to store the
+ * parse-plugins.xml data, and methods to retrieve the name of the appropriate
+ * parsing plugin for a contentType.
+ * 
+ * @author mattmann
+ * @version 1.0
+ */
+class ParsePluginList {
+
+  /* a map to link mimeType to an ordered list of parsing plugins */
+  private Map<String, List<String>> fMimeTypeToPluginMap = null;
+
+  /* A list of aliases */
+  private Map<String, String> aliases = null;
+
+  /**
+   * Constructs a new ParsePluginList
+   */
+  ParsePluginList() {
+    fMimeTypeToPluginMap = new HashMap<String, List<String>>();
+    aliases = new HashMap<String, String>();
+  }
+
+  List<String> getPluginList(String mimeType) {
+    return fMimeTypeToPluginMap.get(mimeType);
+  }
+
+  void setAliases(Map<String, String> aliases) {
+    this.aliases = aliases;
+  }
+
+  Map<String, String> getAliases() {
+    return aliases;
+  }
+
+  void setPluginList(String mimeType, List<String> l) {
+    fMimeTypeToPluginMap.put(mimeType, l);
+  }
+
+  List<String> getSupportedMimeTypes() {
+    return Arrays
+        .asList(fMimeTypeToPluginMap.keySet().toArray(new String[] {}));
+  }
+
+}

[43/51] [partial] nutch git commit: NUTCH-2292 : Mavenize the build for nutch-core and nutch-plugins

Reply via email to