nutc...

ab Thu, 29 Dec 2005 07:29:35 -0800

Author: ab
Date: Thu Dec 29 07:28:30 2005
New Revision: 359822

URL: http://svn.apache.org/viewcvs?rev=359822&view=rev
Log:
A framework for using different page signature implementations. An ordinary
MD5 hash of the raw page content is very often unsuitable when many
near-duplicate pages are crawled.


Now users can select their own page signature implementation, possibly
with better properties than the old one.

Two implementations are provided:

* MD5Signature: backward-compatible with the old schema.

* TextProfileSignature: an example implementation of a signature, which
  gives the same values for near-duplicate pages. Please see Javadoc for
  more information.

This commit changes the CrawlDatum to store page signatures in CrawlDb.
A last-modified time field was added, too. Both changes are in preparation
for patches implementing a self-adjustable fetch interval.

NutchConf was extended to also store and retrieve plain Object values.
This is useful when caching per-job instances.

StringUtil: added methods to display / parse byte[] values.

Added SegmentReader (based on a contribution in NUTCH-121 by Rod Taylor).

Fixed Fetcher to actually use the command-line parameters.

Added:
    lucene/nutch/trunk/src/java/org/apache/nutch/crawl/MD5Signature.java   (with props)
    lucene/nutch/trunk/src/java/org/apache/nutch/crawl/Signature.java   (with props)
    lucene/nutch/trunk/src/java/org/apache/nutch/crawl/SignatureComparator.java   (with props)
    lucene/nutch/trunk/src/java/org/apache/nutch/crawl/SignatureFactory.java   (with props)
    lucene/nutch/trunk/src/java/org/apache/nutch/crawl/TextProfileSignature.java   (with props)
    lucene/nutch/trunk/src/java/org/apache/nutch/segment/SegmentReader.java   (with props)
Modified:
    lucene/nutch/trunk/bin/nutch
    lucene/nutch/trunk/conf/nutch-default.xml
    lucene/nutch/trunk/src/java/org/apache/nutch/crawl/CrawlDatum.java
    lucene/nutch/trunk/src/java/org/apache/nutch/crawl/CrawlDbReducer.java
    lucene/nutch/trunk/src/java/org/apache/nutch/fetcher/Fetcher.java
    lucene/nutch/trunk/src/java/org/apache/nutch/indexer/Indexer.java
    lucene/nutch/trunk/src/java/org/apache/nutch/parse/ParseOutputFormat.java
    lucene/nutch/trunk/src/java/org/apache/nutch/parse/ParseSegment.java
    lucene/nutch/trunk/src/java/org/apache/nutch/util/NutchConf.java
    lucene/nutch/trunk/src/java/org/apache/nutch/util/StringUtil.java
    lucene/nutch/trunk/src/plugin/index-basic/src/java/org/apache/nutch/indexer/basic/BasicIndexingFilter.java

Modified: lucene/nutch/trunk/bin/nutch
URL: 
http://svn.apache.org/viewcvs/lucene/nutch/trunk/bin/nutch?rev=359822&r1=359821&r2=359822&view=diff
==============================================================================
--- lucene/nutch/trunk/bin/nutch (original)
+++ lucene/nutch/trunk/bin/nutch Thu Dec 29 07:28:30 2005
@@ -35,6 +35,7 @@
   echo "  generate          generate new segments to fetch"
   echo "  fetch             fetch a segment's pages"
   echo "  parse             parse a segment's pages"
+  echo "  segread           read / dump segment data"
   echo "  updatedb          update crawl db from segments after fetching"
   echo "  invertlinks       create a linkdb from parsed segments"
   echo "  index             run the indexer on parsed segments and linkdb"
@@ -137,6 +138,8 @@
   CLASS=org.apache.nutch.crawl.CrawlDbReader
 elif [ "$COMMAND" = "readlinkdb" ] ; then
   CLASS=org.apache.nutch.crawl.LinkDbReader
+elif [ "$COMMAND" = "segread" ] ; then
+  CLASS=org.apache.nutch.segment.SegmentReader
 elif [ "$COMMAND" = "updatedb" ] ; then
   CLASS=org.apache.nutch.crawl.CrawlDb
 elif [ "$COMMAND" = "invertlinks" ] ; then

Modified: lucene/nutch/trunk/conf/nutch-default.xml
URL: 
http://svn.apache.org/viewcvs/lucene/nutch/trunk/conf/nutch-default.xml?rev=359822&r1=359821&r2=359822&view=diff
==============================================================================
--- lucene/nutch/trunk/conf/nutch-default.xml (original)
+++ lucene/nutch/trunk/conf/nutch-default.xml Thu Dec 29 07:28:30 2005
@@ -262,6 +262,31 @@
   recoverable errors is generated for fetch.</description>
 </property>
 
+<property>
+  <name>db.signature.class</name>
+  <value>org.apache.nutch.crawl.MD5Signature</value>
+  <description>The default implementation of a page signature. Signatures
+  created with this implementation will be used for duplicate detection
+  and removal.</description>
+</property>
+
+<property>
+  <name>db.signature.text_profile.min_token_len</name>
+  <value>2</value>
+  <description>Minimum token length to be included in the signature.
+  </description>
+</property>
+
+<property>
+  <name>db.signature.text_profile.quant_rate</name>
+  <value>0.01</value>
+  <description>Profile frequencies will be rounded down to a multiple of
+  QUANT = (int)(QUANT_RATE * maxFreq), where maxFreq is a maximum token
+  frequency. If maxFreq > 1 then QUANT will be at least 2, which means that
+  for longer texts tokens with frequency 1 will always be discarded.
+  </description>
+</property>
+
 <!-- generate properties -->
 
 <property>

Modified: lucene/nutch/trunk/src/java/org/apache/nutch/crawl/CrawlDatum.java
URL: 
http://svn.apache.org/viewcvs/lucene/nutch/trunk/src/java/org/apache/nutch/crawl/CrawlDatum.java?rev=359822&r1=359821&r2=359822&view=diff
==============================================================================
--- lucene/nutch/trunk/src/java/org/apache/nutch/crawl/CrawlDatum.java 
(original)
+++ lucene/nutch/trunk/src/java/org/apache/nutch/crawl/CrawlDatum.java Thu Dec 
29 07:28:30 2005
@@ -31,8 +31,9 @@
   public static final String FETCH_DIR_NAME = "crawl_fetch";
   public static final String PARSE_DIR_NAME = "crawl_parse";
 
-  private final static byte CUR_VERSION = 2;
+  private final static byte CUR_VERSION = 3;
 
+  public static final byte STATUS_SIGNATURE = 0;
   public static final byte STATUS_DB_UNFETCHED = 1;
   public static final byte STATUS_DB_FETCHED = 2;
   public static final byte STATUS_DB_GONE = 3;
@@ -42,7 +43,7 @@
   public static final byte STATUS_FETCH_GONE = 7;
   
   public static final String[] statNames = {
-    "INVALID",
+    "signature",
     "DB_unfetched",
     "DB_fetched",
     "DB_gone",
@@ -59,6 +60,8 @@
   private byte retries;
   private float fetchInterval;
   private float score = 1.0f;
+  private byte[] signature = null;
+  private long modifiedTime;
 
   public CrawlDatum() {}
 
@@ -86,6 +89,14 @@
     fetchTime += (long)(MILLISECONDS_PER_DAY*fetchInterval);
   }
 
+  public long getModifiedTime() {
+    return modifiedTime;
+  }
+
+  public void setModifiedTime(long modifiedTime) {
+    this.modifiedTime = modifiedTime;
+  }
+  
   public byte getRetriesSinceFetch() { return retries; }
   public void setRetriesSinceFetch(int retries) {this.retries = (byte)retries;}
 
@@ -97,6 +108,16 @@
   public float getScore() { return score; }
   public void setScore(float score) { this.score = score; }
 
+  public byte[] getSignature() {
+    return signature;
+  }
+
+  public void setSignature(byte[] signature) {
+    if (signature != null && signature.length > 256)
+      throw new RuntimeException("Max signature length (256) exceeded: " + 
signature.length);
+    this.signature = signature;
+  }
+
   //
   // writable methods
   //
@@ -110,7 +131,7 @@
 
   public void readFields(DataInput in) throws IOException {
     byte version = in.readByte();                 // read version
-    if (version != CUR_VERSION)                   // check version
+    if (version > CUR_VERSION)                   // check version
       throw new VersionMismatchException(CUR_VERSION, version);
 
     status = in.readByte();
@@ -118,10 +139,19 @@
     retries = in.readByte();
     fetchInterval = in.readFloat();
     score = in.readFloat();
+    if (version > 2) {
+      modifiedTime = in.readLong();
+      int cnt = in.readByte();
+      if (cnt > 0) {
+        signature = new byte[cnt];
+        in.readFully(signature);
+      } else signature = null;
+    }
   }
 
   /** The number of bytes into a CrawlDatum that the score is stored. */
   private static final int SCORE_OFFSET = 1 + 1 + 8 + 1 + 4;
+  private static final int SIG_OFFSET = SCORE_OFFSET + 4 + 8;
 
   public void write(DataOutput out) throws IOException {
     out.writeByte(CUR_VERSION);                   // store current version
@@ -130,6 +160,13 @@
     out.writeByte(retries);
     out.writeFloat(fetchInterval);
     out.writeFloat(score);
+    out.writeLong(modifiedTime);
+    if (signature == null) {
+      out.writeByte(0);
+    } else {
+      out.writeByte(signature.length);
+      out.write(signature);
+    }
   }
 
   /** Copy the contents of another instance into this instance. */
@@ -139,6 +176,8 @@
     this.retries = that.retries;
     this.fetchInterval = that.fetchInterval;
     this.score = that.score;
+    this.modifiedTime = that.modifiedTime;
+    this.signature = that.signature;
   }
 
 
@@ -159,7 +198,9 @@
       return that.retries - this.retries;
     if (that.fetchInterval != this.fetchInterval)
       return (that.fetchInterval - this.fetchInterval) > 0 ? 1 : -1;
-    return 0;
+    if (that.modifiedTime != this.modifiedTime)
+      return (that.modifiedTime - this.modifiedTime) > 0 ? 1 : -1;
+    return SignatureComparator._compare(this, that);
   }
 
   /** A Comparator optimized for CrawlDatum. */ 
@@ -188,7 +229,13 @@
       float fetchInterval2 = readFloat(b2, s2+1+1+8+1);
       if (fetchInterval2 != fetchInterval1)
         return (fetchInterval2 - fetchInterval1) > 0 ? 1 : -1;
-      return 0;
+      long modifiedTime1 = readLong(b1, s1 + SCORE_OFFSET + 4);
+      long modifiedTime2 = readLong(b2, s2 + SCORE_OFFSET + 4);
+      if (modifiedTime2 != modifiedTime1)
+        return (modifiedTime2 - modifiedTime1) > 0 ? 1 : -1;
+      int sigl1 = b1[s1+SIG_OFFSET];
+      int sigl2 = b2[s2+SIG_OFFSET];
+      return SignatureComparator._compare(b1, SIG_OFFSET, sigl1, b2, 
SIG_OFFSET, sigl2);
     }
   }
 
@@ -206,9 +253,11 @@
     buf.append("Version: " + CUR_VERSION + "\n");
     buf.append("Status: " + getStatus() + " (" + statNames[getStatus()] + 
")\n");
     buf.append("Fetch time: " + new Date(getFetchTime()) + "\n");
+    buf.append("Modified time: " + new Date(getModifiedTime()) + "\n");
     buf.append("Retries since fetch: " + getRetriesSinceFetch() + "\n");
     buf.append("Retry interval: " + getFetchInterval() + " days\n");
     buf.append("Score: " + getScore() + "\n");
+    buf.append("Signature: " + StringUtil.toHexString(getSignature()) + "\n");
     return buf.toString();
   }
 
@@ -219,15 +268,25 @@
     return
       (this.status == other.status) &&
       (this.fetchTime == other.fetchTime) &&
+      (this.modifiedTime == other.modifiedTime) &&
       (this.retries == other.retries) &&
       (this.fetchInterval == other.fetchInterval) &&
+      (SignatureComparator._compare(this.signature, other.signature) == 0) &&
       (this.score == other.score);
   }
 
   public int hashCode() {
+    int res = 0;
+    if (signature != null) {
+      for (int i = 0; i < signature.length / 4; i += 4) {
+        res ^= (int)(signature[i] << 24 + signature[i+1] << 16 +
+                signature[i+2] << 8 + signature[i+3]);
+      }
+    }
     return
-      status ^
+      res ^ status ^
       ((int)fetchTime) ^
+      ((int)modifiedTime) ^
       retries ^
       Float.floatToIntBits(fetchInterval) ^
       Float.floatToIntBits(score);
@@ -240,5 +299,4 @@
       throw new RuntimeException(e);
     }
   }
-
 }

Modified: lucene/nutch/trunk/src/java/org/apache/nutch/crawl/CrawlDbReducer.java
URL: 
http://svn.apache.org/viewcvs/lucene/nutch/trunk/src/java/org/apache/nutch/crawl/CrawlDbReducer.java?rev=359822&r1=359821&r2=359822&view=diff
==============================================================================
--- lucene/nutch/trunk/src/java/org/apache/nutch/crawl/CrawlDbReducer.java 
(original)
+++ lucene/nutch/trunk/src/java/org/apache/nutch/crawl/CrawlDbReducer.java Thu 
Dec 29 07:28:30 2005
@@ -38,6 +38,7 @@
 
     CrawlDatum highest = null;
     CrawlDatum old = null;
+    byte[] signature = null;
     float scoreIncrement = 0.0f;
 
     while (values.hasNext()) {
@@ -55,6 +56,8 @@
       case CrawlDatum.STATUS_LINKED:
         scoreIncrement += datum.getScore();
         break;
+      case CrawlDatum.STATUS_SIGNATURE:
+        signature = datum.getSignature();
       }
     }
 
@@ -76,16 +79,20 @@
         result.setStatus(CrawlDatum.STATUS_DB_UNFETCHED);
         result.setScore(1.0f);                    // initial score is 1.0f
       }
+      result.setSignature(null);                  // reset the signature
       break;
       
     case CrawlDatum.STATUS_FETCH_SUCCESS:         // succesful fetch
       result = highest;                           // use new entry
+      if (highest.getSignature() == null) highest.setSignature(signature);
       result.setStatus(CrawlDatum.STATUS_DB_FETCHED);
       result.setNextFetchTime();
       break;
 
     case CrawlDatum.STATUS_FETCH_RETRY:           // temporary failure
       result = highest;                           // use new entry
+      if (old != null)
+        result.setSignature(old.getSignature());  // use old signature
       if (highest.getRetriesSinceFetch() < retryMax) {
         result.setStatus(CrawlDatum.STATUS_DB_UNFETCHED);
       } else {
@@ -95,6 +102,8 @@
 
     case CrawlDatum.STATUS_FETCH_GONE:            // permanent failure
       result = highest;                           // use new entry
+      if (old != null)
+        result.setSignature(old.getSignature());  // use old signature
       result.setStatus(CrawlDatum.STATUS_DB_GONE);
       break;
 

Added: lucene/nutch/trunk/src/java/org/apache/nutch/crawl/MD5Signature.java
URL: 
http://svn.apache.org/viewcvs/lucene/nutch/trunk/src/java/org/apache/nutch/crawl/MD5Signature.java?rev=359822&view=auto
==============================================================================
--- lucene/nutch/trunk/src/java/org/apache/nutch/crawl/MD5Signature.java (added)
+++ lucene/nutch/trunk/src/java/org/apache/nutch/crawl/MD5Signature.java Thu 
Dec 29 07:28:30 2005
@@ -0,0 +1,37 @@
+/**
+ * Copyright 2005 The Apache Software Foundation
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.nutch.crawl;
+
+import org.apache.nutch.io.MD5Hash;
+import org.apache.nutch.parse.Parse;
+import org.apache.nutch.protocol.Content;
+
+/**
+ * Default implementation of a page signature. It calculates an MD5 hash
+ * of the raw binary content of a page. In case there is no content, it
+ * calculates a hash from the page's URL.
+ * 
+ * @author Andrzej Bialecki &lt;[EMAIL PROTECTED]&gt;
+ */
+public class MD5Signature extends Signature {
+
+  public byte[] calculate(Content content, Parse parse) {
+    byte[] data = content.getContent();
+    if (data == null) data = content.getUrl().getBytes();
+    return MD5Hash.digest(data).getDigest();
+  }
+}

Propchange: lucene/nutch/trunk/src/java/org/apache/nutch/crawl/MD5Signature.java
------------------------------------------------------------------------------
    svn:eol-style = native

Added: lucene/nutch/trunk/src/java/org/apache/nutch/crawl/Signature.java
URL: 
http://svn.apache.org/viewcvs/lucene/nutch/trunk/src/java/org/apache/nutch/crawl/Signature.java?rev=359822&view=auto
==============================================================================
--- lucene/nutch/trunk/src/java/org/apache/nutch/crawl/Signature.java (added)
+++ lucene/nutch/trunk/src/java/org/apache/nutch/crawl/Signature.java Thu Dec 
29 07:28:30 2005
@@ -0,0 +1,36 @@
+/**
+ * Copyright 2005 The Apache Software Foundation
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.nutch.crawl;
+
+import org.apache.nutch.parse.Parse;
+import org.apache.nutch.protocol.Content;
+import org.apache.nutch.util.NutchConf;
+import org.apache.nutch.util.NutchConfigurable;
+
+public abstract class Signature implements NutchConfigurable {
+  protected NutchConf conf;
+  
+  public abstract byte[] calculate(Content content, Parse parse);
+
+  public NutchConf getConf() {
+    return conf;
+  }
+
+  public void setConf(NutchConf conf) {
+    this.conf = conf;
+  }
+}

Propchange: lucene/nutch/trunk/src/java/org/apache/nutch/crawl/Signature.java
------------------------------------------------------------------------------
    svn:eol-style = native

Added: 
lucene/nutch/trunk/src/java/org/apache/nutch/crawl/SignatureComparator.java
URL: 
http://svn.apache.org/viewcvs/lucene/nutch/trunk/src/java/org/apache/nutch/crawl/SignatureComparator.java?rev=359822&view=auto
==============================================================================
--- lucene/nutch/trunk/src/java/org/apache/nutch/crawl/SignatureComparator.java 
(added)
+++ lucene/nutch/trunk/src/java/org/apache/nutch/crawl/SignatureComparator.java 
Thu Dec 29 07:28:30 2005
@@ -0,0 +1,47 @@
+/**
+ * Copyright 2005 The Apache Software Foundation
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.nutch.crawl;
+
+import java.util.Comparator;
+
+public class SignatureComparator implements Comparator {
+  public int compare(Object o1, Object o2) {
+    return _compare(o1, o2);
+  }
+  
+  public static int _compare(Object o1, Object o2) {
+    if (o1 == null && o2 == null) return 0;
+    if (o1 == null) return -1;
+    if (o2 == null) return 1;
+    if (!(o1 instanceof byte[])) return -1;
+    if (!(o2 instanceof byte[])) return 1;
+    byte[] data1 = (byte[])o1;
+    byte[] data2 = (byte[])o2;
+    return _compare(data1, 0, data1.length, data2, 0, data2.length);
+  }
+  
+  public static int _compare(byte[] data1, int s1, int l1, byte[] data2, int 
s2, int l2) {
+    if (l2 > l1) return -1;
+    if (l2 < l1) return 1;
+    int res = 0;
+    for (int i = 0; i < l1; i++) {
+      res = (data1[s1 + i] - data2[s2 + i]);
+      if (res != 0) return res;
+    }
+    return 0;
+  }
+}

Propchange: 
lucene/nutch/trunk/src/java/org/apache/nutch/crawl/SignatureComparator.java
------------------------------------------------------------------------------
    svn:eol-style = native

Added: lucene/nutch/trunk/src/java/org/apache/nutch/crawl/SignatureFactory.java
URL: 
http://svn.apache.org/viewcvs/lucene/nutch/trunk/src/java/org/apache/nutch/crawl/SignatureFactory.java?rev=359822&view=auto
==============================================================================
--- lucene/nutch/trunk/src/java/org/apache/nutch/crawl/SignatureFactory.java 
(added)
+++ lucene/nutch/trunk/src/java/org/apache/nutch/crawl/SignatureFactory.java 
Thu Dec 29 07:28:30 2005
@@ -0,0 +1,53 @@
+/**
+ * Copyright 2005 The Apache Software Foundation
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.nutch.crawl;
+
+import java.util.logging.Logger;
+
+import org.apache.nutch.util.LogFormatter;
+import org.apache.nutch.util.NutchConf;
+
+/**
+ * Factory class, which instantiates a Signature implementation according to 
the
+ * current NutchConf configuration. This newly created instance is cached in 
the
+ * NutchConf instance, so that it could be later retrieved.
+ * 
+ * @author Andrzej Bialecki &lt;[EMAIL PROTECTED]&gt;
+ */
+public class SignatureFactory {
+  private static final Logger LOG =
+    LogFormatter.getLogger(SignatureFactory.class.getName());
+
+  private SignatureFactory() {}                   // no public ctor
+
+  /** Return the default Signature implementation. */
+  public static Signature getSignature(NutchConf conf) {
+    String clazz = conf.get("db.signature.class", 
MD5Signature.class.getName());
+    Signature impl = (Signature)conf.getObject(clazz);
+    if (impl == null) {
+      try {
+        LOG.info("Using Signature impl: " + clazz);
+        Class implClass = Class.forName(clazz);
+        impl = (Signature)implClass.newInstance();
+        impl.setConf(conf);
+      } catch (Exception e) {
+        throw new RuntimeException("Couldn't create " + clazz, e);
+      }
+    }
+    return impl;
+  }
+}

Propchange: 
lucene/nutch/trunk/src/java/org/apache/nutch/crawl/SignatureFactory.java
------------------------------------------------------------------------------
    svn:eol-style = native

Added: 
lucene/nutch/trunk/src/java/org/apache/nutch/crawl/TextProfileSignature.java
URL: 
http://svn.apache.org/viewcvs/lucene/nutch/trunk/src/java/org/apache/nutch/crawl/TextProfileSignature.java?rev=359822&view=auto
==============================================================================
--- 
lucene/nutch/trunk/src/java/org/apache/nutch/crawl/TextProfileSignature.java 
(added)
+++ 
lucene/nutch/trunk/src/java/org/apache/nutch/crawl/TextProfileSignature.java 
Thu Dec 29 07:28:30 2005
@@ -0,0 +1,183 @@
+/**
+ * Copyright 2005 The Apache Software Foundation
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.nutch.crawl;
+
+import java.io.BufferedReader;
+import java.io.File;
+import java.io.FileInputStream;
+import java.io.InputStreamReader;
+import java.util.ArrayList;
+import java.util.Collections;
+import java.util.Comparator;
+import java.util.HashMap;
+import java.util.Iterator;
+
+import org.apache.nutch.io.MD5Hash;
+import org.apache.nutch.parse.Parse;
+import org.apache.nutch.parse.ParseImpl;
+import org.apache.nutch.protocol.Content;
+import org.apache.nutch.util.NutchConf;
+import org.apache.nutch.util.StringUtil;
+
+/**
+ * <p>An implementation of a page signature. It calculates an MD5 hash
+ * of a plain text "profile" of a page. In case there is no text, it
+ * calculates a hash using the {@link MD5Signature}.</p>
+ * <p>The algorithm to calculate a page "profile" takes the plain text version 
of
+ * a page and performs the following steps:
+ * <ul>
+ * <li>remove all characters except letters and digits, and bring all 
characters
+ * to lower case,</li>
+ * <li>split the text into tokens (all consecutive non-whitespace 
characters),</li>
+ * <li>discard tokens equal or shorter than MIN_TOKEN_LEN (default 2 
characters),</li>
+ * <li>sort the list of tokens by decreasing frequency,</li>
+ * <li>round down the counts of tokens to the nearest multiple of QUANT
+ * (<code>QUANT = QUANT_RATE * maxFreq</code>, where <code>QUANT_RATE</code> 
is 0.01f
+ * by default, and <code>maxFreq</code> is the maximum token frequency). If
+ * <code>maxFreq</code> is higher than 1, then QUANT is at least 2 (which
+ * means that tokens with frequency 1 are always discarded).</li>
+ * <li>tokens, which frequency after quantization falls below QUANT, are 
discarded.</li>
+ * <li>create a list of tokens and their quantized frequency, separated by 
spaces,
+ * in the order of decreasing frequency.</li>
+ * </ul>
+ * This list is then submitted to an MD5 hash calculation.
+ * 
+ * @author Andrzej Bialecki &lt;[EMAIL PROTECTED]&gt;
+ */
+public class TextProfileSignature extends Signature {
+  
+  Signature fallback = new MD5Signature();
+
+  public byte[] calculate(Content content, Parse parse) {
+    int MIN_TOKEN_LEN = 
getConf().getInt("db.signature.text_profile.min_token_len", 2);
+    float QUANT_RATE = 
getConf().getFloat("db.signature.text_profile.quant_rate", 0.01f);
+    HashMap tokens = new HashMap();
+    String text = null;
+    if (parse != null) text = parse.getText();
+    if (text == null || text.length() == 0) return fallback.calculate(content, 
parse);
+    StringBuffer curToken = new StringBuffer();
+    int maxFreq = 0;
+    for (int i = 0; i < text.length(); i++) {
+      char c = text.charAt(i);
+      if (Character.isLetterOrDigit(c)) {
+        curToken.append(Character.toLowerCase(c));
+      } else {
+        if (curToken.length() > 0) {
+          if (curToken.length() > MIN_TOKEN_LEN) {
+            // add it
+            String s = curToken.toString();
+            Token tok = (Token)tokens.get(s);
+            if (tok == null) {
+              tok = new Token(0, s);
+              tokens.put(s, tok);
+            }
+            tok.cnt++;
+            if (tok.cnt > maxFreq) maxFreq = tok.cnt;
+          }
+          curToken.setLength(0);
+        }
+      }
+    }
+    // check the last token
+    if (curToken.length() > MIN_TOKEN_LEN) {
+      // add it
+      String s = curToken.toString();
+      Token tok = (Token)tokens.get(s);
+      if (tok == null) {
+        tok = new Token(0, s);
+        tokens.put(s, tok);
+      }
+      tok.cnt++;
+      if (tok.cnt > maxFreq) maxFreq = tok.cnt;
+    }
+    Iterator it = tokens.values().iterator();
+    ArrayList profile = new ArrayList();
+    // calculate the QUANT value
+    int QUANT = Math.round(maxFreq * QUANT_RATE);
+    if (QUANT < 2) {
+      if (maxFreq > 1) QUANT = 2;
+      else QUANT = 1;
+    }
+    while(it.hasNext()) {
+      Token t = (Token)it.next();
+      // round down to the nearest QUANT
+      t.cnt = (t.cnt / QUANT) * QUANT;
+      // discard the frequencies below the QUANT
+      if (t.cnt < QUANT) {
+        continue;
+      }
+      profile.add(t);
+    }
+    Collections.sort(profile, new TokenComparator());
+    StringBuffer newText = new StringBuffer();
+    it = profile.iterator();
+    while (it.hasNext()) {
+      Token t = (Token)it.next();
+      if (newText.length() > 0) newText.append("\n");
+      newText.append(t.toString());
+    }
+    return MD5Hash.digest(newText.toString()).getDigest();
+  }
+  
+  private static class Token {
+    public int cnt;
+    public String val;
+    
+    public Token(int cnt, String val) {
+      this.cnt = cnt;
+      this.val = val;
+    }
+    
+    public String toString() {
+      return val + " " + cnt;
+    }
+  }
+  
+  private static class TokenComparator implements Comparator {
+    public int compare(Object o1, Object o2) {
+      Token t1 = (Token)o1;
+      Token t2 = (Token)o2;
+      return t2.cnt - t1.cnt;
+    }
+  }
+  
+  public static void main(String[] args) throws Exception {
+    TextProfileSignature sig = new TextProfileSignature();
+    sig.setConf(NutchConf.get());
+    HashMap res = new HashMap();
+    File[] files = new File(args[0]).listFiles();
+    for (int i = 0; i < files.length; i++) {
+      FileInputStream fis = new FileInputStream(files[i]);
+      BufferedReader br = new BufferedReader(new InputStreamReader(fis, 
"UTF-8"));
+      StringBuffer text = new StringBuffer();
+      String line = null;
+      while ((line = br.readLine()) != null) {
+        if (text.length() > 0) text.append("\n");
+        text.append(line);
+      }
+      br.close();
+      byte[] signature = sig.calculate(null, new ParseImpl(text.toString(), 
null));
+      res.put(files[i].toString(), signature);
+    }
+    Iterator it = res.keySet().iterator();
+    while (it.hasNext()) {
+      String name = (String)it.next();
+      byte[] signature = (byte[])res.get(name);
+      System.out.println(name + "\t" + StringUtil.toHexString(signature));
+    }
+  }
+}

Propchange: 
lucene/nutch/trunk/src/java/org/apache/nutch/crawl/TextProfileSignature.java
------------------------------------------------------------------------------
    svn:eol-style = native

Modified: lucene/nutch/trunk/src/java/org/apache/nutch/fetcher/Fetcher.java
URL: 
http://svn.apache.org/viewcvs/lucene/nutch/trunk/src/java/org/apache/nutch/fetcher/Fetcher.java?rev=359822&r1=359821&r2=359822&view=diff
==============================================================================
--- lucene/nutch/trunk/src/java/org/apache/nutch/fetcher/Fetcher.java (original)
+++ lucene/nutch/trunk/src/java/org/apache/nutch/fetcher/Fetcher.java Thu Dec 
29 07:28:30 2005
@@ -21,6 +21,7 @@
 
 import org.apache.nutch.io.*;
 import org.apache.nutch.crawl.CrawlDatum;
+import org.apache.nutch.crawl.SignatureFactory;
 import org.apache.nutch.fs.*;
 import org.apache.nutch.net.*;
 import org.apache.nutch.util.*;
@@ -36,7 +37,7 @@
   public static final Logger LOG =
     LogFormatter.getLogger("org.apache.nutch.fetcher.Fetcher");
   
-  public static final String DIGEST_KEY = "nutch.content.digest";
+  public static final String SIGNATURE_KEY = "nutch.content.digest";
   public static final String SEGMENT_NAME_KEY = "nutch.segment.name";
   public static final String SCORE_KEY = "nutch.crawl.score";
 
@@ -93,6 +94,7 @@
               break;                              // at eof, exit
             }
           } catch (IOException e) {
+            e.printStackTrace();
             LOG.severe("fetcher caught:"+e.toString());
             break;
           }
@@ -173,6 +175,7 @@
         }
 
       } catch (Throwable e) {
+        e.printStackTrace();
         LOG.severe("fetcher caught:"+e.toString());
       } finally {
         synchronized (Fetcher.this) {activeThreads--;} // count threads
@@ -193,11 +196,9 @@
 
       if (content == null) {
         String url = key.toString();
-        content = new Content(url,url,new byte[0],"",new ContentProperties());
+        content = new Content(url, url, new byte[0], "", new 
ContentProperties());
       }
 
-      content.getMetadata().setProperty           // add digest to metadata
-        (DIGEST_KEY, MD5Hash.digest(content.getContent()).toString());
       content.getMetadata().setProperty           // add segment to metadata
         (SEGMENT_NAME_KEY, segmentName);
       content.getMetadata().setProperty           // add score to metadata
@@ -213,9 +214,14 @@
           parseStatus = new ParseStatus(e);
         }
         if (!parseStatus.isSuccess()) {
-          LOG.warning("Error parsing: "+key+": "+parseStatus);
-          parse = null;
+          LOG.warning("Error parsing: " + key + ": " + parseStatus);
+          parse = parseStatus.getEmptyParse();
         }
+        // Calculate page signature. For non-parsing fetchers this will
+        // be done in ParseSegment
+        byte[] signature = 
SignatureFactory.getSignature(getConf()).calculate(content, parse);
+        parse.getData().getMetadata().setProperty(SIGNATURE_KEY, 
StringUtil.toHexString(signature));
+        datum.setSignature(signature);
       }
 
       try {
@@ -225,6 +231,7 @@
                              storingContent ? content : null,
                              parse != null ? new ParseImpl(parse) : null));
       } catch (IOException e) {
+        e.printStackTrace();
         LOG.severe("fetcher caught:"+e.toString());
       }
     }
@@ -310,7 +317,7 @@
     
   }
 
-  public void fetch(File segment, int threads)
+  public void fetch(File segment, int threads, boolean parsing)
     throws IOException {
 
     LOG.info("Fetcher: starting");
@@ -320,6 +327,7 @@
 
     job.setInt("fetcher.threads.fetch", threads);
     job.set(SEGMENT_NAME_KEY, segment.getName());
+    job.setBoolean("fetcher.parse", parsing);
 
     job.setInputDir(new File(segment, CrawlDatum.GENERATE_DIR_NAME));
     job.setInputFormat(InputFormat.class);
@@ -341,7 +349,7 @@
   /** Run the fetcher. */
   public static void main(String[] args) throws Exception {
 
-    String usage = "Usage: Fetcher <segment> [-threads n]";
+    String usage = "Usage: Fetcher <segment> [-threads n] [-noParsing]";
 
     if (args.length < 1) {
       System.err.println(usage);
@@ -353,16 +361,21 @@
     NutchConf conf = NutchConf.get();
 
     int threads = conf.getInt("fetcher.threads.fetch", 10);
+    boolean parsing = true;
 
     for (int i = 1; i < args.length; i++) {       // parse command line
       if (args[i].equals("-threads")) {           // found -threads option
         threads =  Integer.parseInt(args[++i]);
-      }
+      } else if (args[i].equals("-noParsing")) parsing = false;
     }
 
+    conf.setInt("fetcher.threads.fetch", threads);
+    if (!parsing) {
+      conf.setBoolean("fetcher.parse", parsing);
+    }
     Fetcher fetcher = new Fetcher(conf);          // make a Fetcher
     
-    fetcher.fetch(segment, threads);              // run the Fetcher
+    fetcher.fetch(segment, threads, parsing);              // run the Fetcher
 
   }
 }

Modified: lucene/nutch/trunk/src/java/org/apache/nutch/indexer/Indexer.java
URL: 
http://svn.apache.org/viewcvs/lucene/nutch/trunk/src/java/org/apache/nutch/indexer/Indexer.java?rev=359822&r1=359821&r2=359822&view=diff
==============================================================================
--- lucene/nutch/trunk/src/java/org/apache/nutch/indexer/Indexer.java (original)
+++ lucene/nutch/trunk/src/java/org/apache/nutch/indexer/Indexer.java Thu Dec 29 07:28:30 2005
@@ -199,7 +199,7 @@
                             meta.getProperty(Fetcher.SEGMENT_NAME_KEY)));
 
     // add digest, used by dedup
-    doc.add(Field.UnIndexed("digest", meta.getProperty(Fetcher.DIGEST_KEY)));
+    doc.add(Field.UnIndexed("digest", 
meta.getProperty(Fetcher.SIGNATURE_KEY)));
 
     // boost is opic
     float boost = (float)Math.pow(dbDatum.getScore(), scorePower);

Modified: 
lucene/nutch/trunk/src/java/org/apache/nutch/parse/ParseOutputFormat.java
URL: 
http://svn.apache.org/viewcvs/lucene/nutch/trunk/src/java/org/apache/nutch/parse/ParseOutputFormat.java?rev=359822&r1=359821&r2=359822&view=diff
==============================================================================
--- lucene/nutch/trunk/src/java/org/apache/nutch/parse/ParseOutputFormat.java (original)
+++ lucene/nutch/trunk/src/java/org/apache/nutch/parse/ParseOutputFormat.java Thu Dec 29 07:28:30 2005
@@ -22,6 +22,7 @@
 import org.apache.nutch.fs.*;
 import org.apache.nutch.mapred.*;
 import org.apache.nutch.parse.*;
+import org.apache.nutch.util.StringUtil;
 import org.apache.nutch.net.*;
 
 import java.io.*;
@@ -63,6 +64,18 @@
           
           textOut.append(key, new ParseText(parse.getText()));
           dataOut.append(key, parse.getData());
+          
+          // recover the signature prepared by Fetcher or ParseSegment
+          String sig = 
parse.getData().getMetadata().getProperty(Fetcher.SIGNATURE_KEY);
+          if (sig != null) {
+            byte[] signature = StringUtil.fromHexString(sig);
+            if (signature != null) {
+              // append a CrawlDatum with a signature
+              CrawlDatum d = new CrawlDatum(CrawlDatum.STATUS_SIGNATURE, 0.0f);
+              d.setSignature(signature);
+              crawlOut.append(key, d);
+            }
+          }
 
           // collect outlinks for subsequent db update
           Outlink[] links = parse.getData().getOutlinks();

Modified: lucene/nutch/trunk/src/java/org/apache/nutch/parse/ParseSegment.java
URL: 
http://svn.apache.org/viewcvs/lucene/nutch/trunk/src/java/org/apache/nutch/parse/ParseSegment.java?rev=359822&r1=359821&r2=359822&view=diff
==============================================================================
--- lucene/nutch/trunk/src/java/org/apache/nutch/parse/ParseSegment.java (original)
+++ lucene/nutch/trunk/src/java/org/apache/nutch/parse/ParseSegment.java Thu Dec 29 07:28:30 2005
@@ -16,6 +16,8 @@
 
 package org.apache.nutch.parse;
 
+import org.apache.nutch.crawl.SignatureFactory;
+import org.apache.nutch.fetcher.Fetcher;
 import org.apache.nutch.io.*;
 import org.apache.nutch.parse.ParseOutputFormat;
 import org.apache.nutch.mapred.*;
@@ -32,13 +34,14 @@
   public static final Logger LOG =
     LogFormatter.getLogger(Parser.class.getName());
 
-  public ParseSegment() { super(null); }
+  public ParseSegment() { super(NutchConf.get()); }
 
   public ParseSegment(NutchConf conf) {
     super(conf);
   }
 
   public void configure(JobConf job) {
+    setConf(job);
   }
 
   public void map(WritableComparable key, Writable value,
@@ -55,6 +58,9 @@
       status = new ParseStatus(e);
     }
 
+    // compute the new signature
+    byte[] signature = 
SignatureFactory.getSignature(getConf()).calculate(content, parse);
+    parse.getData().getMetadata().setProperty(Fetcher.SIGNATURE_KEY, 
StringUtil.toHexString(signature));
     if (status.isSuccess()) {
       output.collect(key, new ParseImpl(parse.getText(), parse.getData()));
     } else {

Added: lucene/nutch/trunk/src/java/org/apache/nutch/segment/SegmentReader.java
URL: 
http://svn.apache.org/viewcvs/lucene/nutch/trunk/src/java/org/apache/nutch/segment/SegmentReader.java?rev=359822&view=auto
==============================================================================
--- lucene/nutch/trunk/src/java/org/apache/nutch/segment/SegmentReader.java (added)
+++ lucene/nutch/trunk/src/java/org/apache/nutch/segment/SegmentReader.java Thu Dec 29 07:28:30 2005
@@ -0,0 +1,232 @@
+/**
+ * Copyright 2005 The Apache Software Foundation
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.nutch.segment;
+
+import java.io.BufferedReader;
+import java.io.BufferedWriter;
+import java.io.File;
+import java.io.IOException;
+import java.io.InputStreamReader;
+import java.io.OutputStreamWriter;
+import java.io.PrintStream;
+import java.io.PrintWriter;
+import java.util.Iterator;
+import java.util.logging.Logger;
+
+import org.apache.nutch.crawl.CrawlDatum;
+import org.apache.nutch.fs.NutchFileSystem;
+import org.apache.nutch.io.ObjectWritable;
+import org.apache.nutch.io.UTF8;
+import org.apache.nutch.io.Writable;
+import org.apache.nutch.io.WritableComparable;
+import org.apache.nutch.mapred.FileSplit;
+import org.apache.nutch.mapred.JobClient;
+import org.apache.nutch.mapred.JobConf;
+import org.apache.nutch.mapred.OutputCollector;
+import org.apache.nutch.mapred.RecordReader;
+import org.apache.nutch.mapred.RecordWriter;
+import org.apache.nutch.mapred.Reducer;
+import org.apache.nutch.mapred.Reporter;
+import org.apache.nutch.mapred.SequenceFileInputFormat;
+import org.apache.nutch.mapred.SequenceFileRecordReader;
+import org.apache.nutch.parse.ParseData;
+import org.apache.nutch.parse.ParseText;
+import org.apache.nutch.protocol.Content;
+import org.apache.nutch.util.LogFormatter;
+import org.apache.nutch.util.NutchConf;
+import org.apache.nutch.util.NutchConfigured;
+
+/** Dump the content of a segment. */
+public class SegmentReader extends NutchConfigured implements Reducer {
+
+  public static final String DIR_NAME = "segdump";
+
+  public static final Logger LOG =
+    LogFormatter.getLogger(SegmentReader.class.getName());
+
+  long recNo = 0L;
+
+  /** Input format whose record reader hands every value to the caller wrapped
+   * in an {@link ObjectWritable}, to permit merging different types in
+   * reduce. */
+  public static class InputFormat extends SequenceFileInputFormat {
+    public RecordReader getRecordReader(NutchFileSystem fs, FileSplit split,
+                                        JobConf job, Reporter reporter)
+      throws IOException {
+      // report which split is about to be read
+      reporter.setStatus(split.toString());
+
+      return new SequenceFileRecordReader(fs, split) {
+          public synchronized boolean next(Writable key, Writable value)
+            throws IOException {
+            ObjectWritable holder = (ObjectWritable)value;
+            try {
+              // a new instance of getValueClass() becomes the wrapped payload
+              Object payload = getValueClass().newInstance();
+              holder.set(payload);
+            } catch (Exception e) {
+              throw new IOException(e.toString());
+            }
+            // let the parent reader fill the payload held by the wrapper
+            Writable target = (Writable)holder.get();
+            return super.next(key, target);
+          }
+        };
+    }
+  }
+
+  /** Output format that prints every record value as one text entry into a
+   * file below the {@link SegmentReader#DIR_NAME} sub-directory of the job's
+   * output directory. */
+  public static class TextOutputFormat
+    implements org.apache.nutch.mapred.OutputFormat {
+
+    public RecordWriter getRecordWriter(final NutchFileSystem fs, JobConf job,
+                                        String name) throws IOException {
+
+      final File dumpDir = new File(job.getOutputDir(), SegmentReader.DIR_NAME);
+      final File segmentDumpFile = new File(dumpDir, name);
+
+      // Get the old copy out of the way
+      fs.delete(segmentDumpFile);
+
+      final PrintStream out = new PrintStream(fs.create(segmentDumpFile));
+      return new RecordWriter() {
+          public synchronized void write(WritableComparable key, Writable value)
+            throws IOException {
+            // the key is not written; only the wrapped String is
+            Object text = ((ObjectWritable)value).get();
+            out.println((String)text);
+          }
+          public synchronized void close(Reporter reporter) throws IOException {
+            out.close();
+          }
+        };
+    }
+  }
+
+  /** Creates a reader without a configuration (null is passed to the
+   * superclass). */
+  public SegmentReader() {
+    super(null);
+  }
+
+  /** Creates a reader that uses the given configuration. */
+  public SegmentReader(NutchConf conf) {
+    super(conf);
+  }
+
+  /** Does nothing; the job configuration is not consulted here. */
+  public void configure(JobConf job) {
+    // intentionally empty
+  }
+
+  /** Builds a plain-text dump of everything collected for one key: a record
+   * number, the key as URL, then one labelled section per recognized value
+   * (CrawlDatum, Content, ParseData, ParseText). Values of any other type are
+   * logged as a warning and left out. The dump is collected under the same
+   * key, wrapped in an ObjectWritable. */
+  public void reduce(WritableComparable key, Iterator values,
+                     OutputCollector output, Reporter reporter)
+    throws IOException {
+    StringBuffer text = new StringBuffer();
+
+    text.append("\nRecno:: ");
+    text.append(recNo++);
+    text.append("\n");
+    text.append("URL: " + key.toString() + "\n");
+    while (values.hasNext()) {
+      Object item = ((ObjectWritable)values.next()).get(); // unwrap
+
+      // pick the section heading that matches the value's type
+      String heading;
+      if (item instanceof CrawlDatum) {
+        heading = "\nCrawlDatum::\n";
+      } else if (item instanceof Content) {
+        heading = "\nContent::\n";
+      } else if (item instanceof ParseData) {
+        heading = "\nParseData::\n";
+      } else if (item instanceof ParseText) {
+        heading = "\nParseText::\n";
+      } else {
+        heading = null;
+      }
+
+      if (heading == null) {
+        LOG.warning("Unrecognized type: " + item.getClass());
+      } else {
+        text.append(heading).append(item.toString());
+      }
+    }
+    output.collect(key, new ObjectWritable(text.toString()));
+  }
+
+  public void reader(File segment) throws IOException {
+    LOG.info("Reader: segment: " + segment);
+
+    JobConf job = new JobConf(getConf());
+
+    job.addInputDir(new File(segment, CrawlDatum.GENERATE_DIR_NAME));
+    job.addInputDir(new File(segment, CrawlDatum.FETCH_DIR_NAME));
+    job.addInputDir(new File(segment, CrawlDatum.PARSE_DIR_NAME));
+    job.addInputDir(new File(segment, Content.DIR_NAME));
+    job.addInputDir(new File(segment, ParseData.DIR_NAME));
+    job.addInputDir(new File(segment, ParseText.DIR_NAME));
+
+    job.setInputFormat(InputFormat.class);
+    job.setInputKeyClass(UTF8.class);
+    job.setInputValueClass(ObjectWritable.class);
+
+    job.setReducerClass(SegmentReader.class);
+    
+    job.setOutputDir(segment);
+    job.setOutputFormat(TextOutputFormat.class);
+    job.setOutputKeyClass(UTF8.class);
+    job.setOutputValueClass(ObjectWritable.class);
+
+    JobClient.runJob(job);
+    
+    // concatenate the output
+    NutchFileSystem nfs = NutchFileSystem.get(job);
+    File directory = new File(job.getOutputDir(), SegmentReader.DIR_NAME);
+    File dumpFile = new File(directory, job.get("segment.dump.dir", "dump"));
+
+    // remove the old file 
+    nfs.delete(dumpFile);
+    File[] files = nfs.listFiles(directory);
+    
+    PrintWriter writer = null;
+    int currentReccordNumber = 0;
+    if (files.length > 0) {
+        writer = new PrintWriter(new BufferedWriter(new 
OutputStreamWriter(nfs.create(dumpFile))));
+        try {
+            for (int i = 0 ; i < files.length; i++) {
+                File partFile = (File)files[i];
+                try {
+                    currentReccordNumber = append(nfs, job, partFile, writer, 
currentReccordNumber);
+                } catch (IOException exception) {
+                    LOG.warning("Couldn't copy the content of " + 
partFile.toString() + " into " + dumpFile.toString());
+                    LOG.warning(exception.getMessage());
+                }
+            }
+        }
+        finally {
+            writer.close();
+        }
+    }
+    LOG.info("Reader: done");
+  }
+
+  /** Copies <code>src</code> line by line into <code>writer</code>; every line
+   * that starts with "Recno:: " is replaced by a freshly numbered one.
+   * @param currentReccordNumber number to give to the first record of src
+   * @return the number to give to the next record
+   */
+  private int append(NutchFileSystem nfs, NutchConf conf, File src,
+                     PrintWriter writer, int currentReccordNumber)
+    throws IOException {
+      BufferedReader in =
+        new BufferedReader(new InputStreamReader(nfs.open(src)));
+      try {
+          int next = currentReccordNumber;
+          for (String line = in.readLine(); line != null; line = in.readLine()) {
+              if (line.startsWith("Recno:: ")) {
+                  writer.println("Recno:: " + next++);
+              } else {
+                  writer.println(line);
+              }
+          }
+          return next;
+      } finally {
+          in.close();
+      }
+  }
+  
+  /** Command-line entry point: dumps the segment named by the first
+   * argument, or prints the usage and exits with -1 if there is none. */
+  public static void main(String[] args) throws Exception {
+    SegmentReader dumper = new SegmentReader(NutchConf.get());
+
+    if (args.length < 1) {
+      System.err.println("Usage: SegmentReader <segment>");
+      System.exit(-1);
+    }
+    File segment = new File(args[0]);
+    dumper.reader(segment);
+  }
+}

Propchange: 
lucene/nutch/trunk/src/java/org/apache/nutch/segment/SegmentReader.java
------------------------------------------------------------------------------
    svn:eol-style = native

Modified: lucene/nutch/trunk/src/java/org/apache/nutch/util/NutchConf.java
URL: 
http://svn.apache.org/viewcvs/lucene/nutch/trunk/src/java/org/apache/nutch/util/NutchConf.java?rev=359822&r1=359821&r2=359822&view=diff
==============================================================================
--- lucene/nutch/trunk/src/java/org/apache/nutch/util/NutchConf.java (original)
+++ lucene/nutch/trunk/src/java/org/apache/nutch/util/NutchConf.java Thu Dec 29 07:28:30 2005
@@ -91,6 +91,24 @@
 
   /** Returns the value of the <code>name</code> property, or null if no
    * such property exists. */
+  public Object getObject(String name) { return getProps().get(name);}
+
+  /** Stores <code>value</code> under the <code>name</code> property. */
+  public void setObject(String name, Object value) {
+    getProps().put(name, value);
+  }
+
+  /** Returns the value of the <code>name</code> property.  If no such property
+   * exists, then <code>defaultValue</code> is returned.
+   */
+  public Object get(String name, Object defaultValue) {
+    Object res = getObject(name);
+    if (res != null) return res;
+    else return defaultValue;
+  }
+  
+  /** Returns the value of the <code>name</code> property, or null if no
+   * such property exists. */
   public String get(String name) { return getProps().getProperty(name);}
 
   /** Sets the value of the <code>name</code> property. */

Modified: lucene/nutch/trunk/src/java/org/apache/nutch/util/StringUtil.java
URL: 
http://svn.apache.org/viewcvs/lucene/nutch/trunk/src/java/org/apache/nutch/util/StringUtil.java?rev=359822&r1=359821&r2=359822&view=diff
==============================================================================
--- lucene/nutch/trunk/src/java/org/apache/nutch/util/StringUtil.java (original)
+++ lucene/nutch/trunk/src/java/org/apache/nutch/util/StringUtil.java Thu Dec 29 07:28:30 2005
@@ -49,6 +49,79 @@
     return sb.toString();
   }
 
+
+  /** Lower-case hexadecimal digits, indexed by nibble value. */
+  private static final char[] HEX_DIGITS =
+  {'0','1','2','3','4','5','6','7','8','9','a','b','c','d','e','f'};
+
+  /**
+   * Convenience call for {@link #toHexString(byte[], String, int)}, where
+   * <code>sep = null; lineLen = Integer.MAX_VALUE</code>.
+   * @param buf input data, may be null
+   * @return lower-case hexadecimal representation of <code>buf</code> (two
+   * digits per byte), or null if <code>buf</code> is null
+   */
+  public static String toHexString(byte[] buf) {
+    return toHexString(buf, null, Integer.MAX_VALUE);
+  }
+
+  /**
+   * Get a text representation of a byte[] as hexadecimal String, where each
+   * pair of hexadecimal digits corresponds to consecutive bytes in the array.
+   * @param buf input data, may be null
+   * @param sep separate every pair of hexadecimal digits with this separator,
+   * or null if no separation is needed. The separator is put between bytes
+   * of the same line only, never after the last byte.
+   * @param lineLen break the output String into lines containing output for
+   * lineLen bytes. A non-positive value disables line breaking.
+   * @return the hexadecimal text, or null if <code>buf</code> is null
+   */
+  public static String toHexString(byte[] buf, String sep, int lineLen) {
+    if (buf == null) return null;
+    if (lineLen <= 0) lineLen = Integer.MAX_VALUE;
+    StringBuffer res = new StringBuffer(buf.length * 2);
+    final int last = buf.length - 1;
+    for (int i = 0; i < buf.length; i++) {
+      int b = buf[i];
+      // the masks discard the sign extension of negative bytes
+      res.append(HEX_DIGITS[(b >> 4) & 0xf]);
+      res.append(HEX_DIGITS[b & 0xf]);
+      if (i == last) break;            // nothing follows the last byte
+      // a line is full after every lineLen-th byte, counting from 1
+      if ((i + 1) % lineLen == 0) res.append('\n');
+      else if (sep != null) res.append(sep);
+    }
+    return res.toString();
+  }
+  
+  /**
+   * Convert a String containing consecutive (no inside whitespace) hexadecimal
+   * digits into a corresponding byte array. If the number of digits is not
+   * even, a '0' will be appended in the front of the String prior to
+   * conversion. Leading and trailing whitespace is ignored. Both upper-case
+   * and lower-case digits are accepted.
+   * @param text input text, may be null
+   * @return converted byte array, or null if unable to convert (the text is
+   * null or contains a character that is not a hexadecimal digit)
+   */
+  public static byte[] fromHexString(String text) {
+    // null cannot be converted; report it like any other bad input
+    if (text == null) return null;
+    text = text.trim();
+    if (text.length() % 2 != 0) text = "0" + text;
+    int resLen = text.length() / 2;
+    int loNibble, hiNibble;
+    byte[] res = new byte[resLen];
+    for (int i = 0; i < resLen; i++) {
+      int j = i << 1;                  // index of the first digit of byte i
+      hiNibble = charToNibble(text.charAt(j));
+      loNibble = charToNibble(text.charAt(j + 1));
+      if (loNibble == -1 || hiNibble == -1) return null;
+      res[i] = (byte)(hiNibble << 4 | loNibble);
+    }
+    return res;
+  }
+
+  /**
+   * Returns the value (0-15) of a single hexadecimal digit, or -1 if
+   * <code>c</code> is not a hexadecimal digit.
+   */
+  private static final int charToNibble(char c) {
+    if (c >= '0' && c <= '9') {
+      return c - '0';
+    } else if (c >= 'a' && c <= 'f') {
+      return 0xa + (c - 'a');
+    } else if (c >= 'A' && c <= 'F') {
+      return 0xA + (c - 'A');
+    } else {
+      return -1;
+    }
+  }
+
   /**
    * Parse the character encoding from the specified content type header.
    * If the content type is null, or there is no explicit character encoding,

Modified: 
lucene/nutch/trunk/src/plugin/index-basic/src/java/org/apache/nutch/indexer/basic/BasicIndexingFilter.java
URL: 
http://svn.apache.org/viewcvs/lucene/nutch/trunk/src/plugin/index-basic/src/java/org/apache/nutch/indexer/basic/BasicIndexingFilter.java?rev=359822&r1=359821&r2=359822&view=diff
==============================================================================
--- lucene/nutch/trunk/src/plugin/index-basic/src/java/org/apache/nutch/indexer/basic/BasicIndexingFilter.java (original)
+++ lucene/nutch/trunk/src/plugin/index-basic/src/java/org/apache/nutch/indexer/basic/BasicIndexingFilter.java Thu Dec 29 07:28:30 2005
@@ -70,7 +70,7 @@
     
     // anchors are indexed, so they're searchable, but not stored in index
     try {
-      String[] anchors = inlinks.getAnchors();
+      String[] anchors = (inlinks != null ? inlinks.getAnchors() : new 
String[0]);
       for (int i = 0; i < anchors.length; i++) {
         doc.add(Field.UnStored("anchor", anchors[i]));
       }

svn commit: r359822 - in /lucene/nutch/trunk: bin/ conf/ src/java/org/apache/nutch/crawl/ src/java/org/apache/nutch/fetcher/ src/java/org/apache/nutch/indexer/ src/java/org/apache/nutch/parse/ src/java/org/apache/nutch/segment/ src/java/org/apache/nutc...

Reply via email to