Author: snagel
Date: Fri Aug 22 22:23:27 2014
New Revision: 1619942

URL: http://svn.apache.org/r1619942
Log:
NUTCH-1693 TextMD5Signature computed on textual content

Added:
    nutch/branches/2.x/src/java/org/apache/nutch/crawl/TextMD5Signature.java   
(with props)
    nutch/trunk/src/java/org/apache/nutch/crawl/TextMD5Signature.java   (with 
props)
Modified:
    nutch/branches/2.x/CHANGES.txt
    nutch/branches/2.x/src/java/org/apache/nutch/parse/ParseUtil.java
    nutch/trunk/CHANGES.txt

Modified: nutch/branches/2.x/CHANGES.txt
URL: 
http://svn.apache.org/viewvc/nutch/branches/2.x/CHANGES.txt?rev=1619942&r1=1619941&r2=1619942&view=diff
==============================================================================
--- nutch/branches/2.x/CHANGES.txt (original)
+++ nutch/branches/2.x/CHANGES.txt Fri Aug 22 22:23:27 2014
@@ -2,6 +2,8 @@ Nutch Change Log
 
 Current Development
 
+* NUTCH-1693 TextMD5Signature computed on textual content (Tien Nguyen Manh, markus via snagel)
+
 * NUTCH-1409 remove deprecated properties db.{default,max}.fetch.interval, generate.max.per.host.by.ip (Matthias Agethle via snagel)
 
 * NUTCH-1819 batchId in GeneratorJob ( Fjodor Vershinin via lewismc)

Added: nutch/branches/2.x/src/java/org/apache/nutch/crawl/TextMD5Signature.java
URL: 
http://svn.apache.org/viewvc/nutch/branches/2.x/src/java/org/apache/nutch/crawl/TextMD5Signature.java?rev=1619942&view=auto
==============================================================================
--- nutch/branches/2.x/src/java/org/apache/nutch/crawl/TextMD5Signature.java 
(added)
+++ nutch/branches/2.x/src/java/org/apache/nutch/crawl/TextMD5Signature.java 
Fri Aug 22 22:23:27 2014
@@ -0,0 +1,58 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.nutch.crawl;
+
+import java.util.Collection;
+import java.util.HashSet;
+
+import org.apache.hadoop.io.MD5Hash;
+import org.apache.nutch.storage.WebPage;
+
+/**
+ * Implementation of a page signature. It calculates an MD5 hash of the
+ * textual content of a page. In case there is no text (null or empty), it
+ * delegates to {@link MD5Signature} as a fallback.
+ */
+public class TextMD5Signature extends Signature {
+
+  private final static Collection<WebPage.Field> FIELDS = new 
HashSet<WebPage.Field>();
+
+  static {
+    FIELDS.add(WebPage.Field.TEXT);
+  }
+
+  Signature fallback = new MD5Signature();
+
+  @Override
+  public byte[] calculate(WebPage page) {
+    CharSequence text = page.getText();
+
+    if (text == null || text.length() == 0) {
+      return fallback.calculate(page);
+    }
+
+    return MD5Hash.digest(text.toString()).getDigest();
+  }
+
+  @Override
+  public Collection<WebPage.Field> getFields() {
+    Collection<WebPage.Field> fields = new HashSet<WebPage.Field>(FIELDS);
+    fields.addAll(fallback.getFields());
+    return fields;
+  }
+}

Propchange: 
nutch/branches/2.x/src/java/org/apache/nutch/crawl/TextMD5Signature.java
------------------------------------------------------------------------------
    svn:eol-style = native

Modified: nutch/branches/2.x/src/java/org/apache/nutch/parse/ParseUtil.java
URL: 
http://svn.apache.org/viewvc/nutch/branches/2.x/src/java/org/apache/nutch/parse/ParseUtil.java?rev=1619942&r1=1619941&r2=1619942&view=diff
==============================================================================
--- nutch/branches/2.x/src/java/org/apache/nutch/parse/ParseUtil.java (original)
+++ nutch/branches/2.x/src/java/org/apache/nutch/parse/ParseUtil.java Fri Aug 
22 22:23:27 2014
@@ -187,8 +187,6 @@ public class ParseUtil extends Configure
       return;
     }
 
-    final byte[] signature = sig.calculate(page);
-
     org.apache.nutch.storage.ParseStatus pstatus = parse.getParseStatus();
     page.setParseStatus(pstatus);
     if (ParseStatusUtils.isSuccess(pstatus)) {
@@ -233,6 +231,7 @@ public class ParseUtil extends Configure
         if (prevSig != null) {
           page.setPrevSignature(prevSig);
         }
+        final byte[] signature = sig.calculate(page);
         page.setSignature(ByteBuffer.wrap(signature));
         if (page.getOutlinks() != null) {
           page.getOutlinks().clear();

Modified: nutch/trunk/CHANGES.txt
URL: 
http://svn.apache.org/viewvc/nutch/trunk/CHANGES.txt?rev=1619942&r1=1619941&r2=1619942&view=diff
==============================================================================
--- nutch/trunk/CHANGES.txt (original)
+++ nutch/trunk/CHANGES.txt Fri Aug 22 22:23:27 2014
@@ -2,6 +2,8 @@ Nutch Change Log
 
 Nutch Current Development
 
+* NUTCH-1693 TextMD5Signature computed on textual content (Tien Nguyen Manh, markus via snagel)
+
 * NUTCH-1409 remove deprecated properties db.{default,max}.fetch.interval, generate.max.per.host.by.ip (Matthias Agethle via snagel)
 
 Nutch 1.9 Release Change Log - 12/08/2014 (dd/mm/yyyy)

Added: nutch/trunk/src/java/org/apache/nutch/crawl/TextMD5Signature.java
URL: 
http://svn.apache.org/viewvc/nutch/trunk/src/java/org/apache/nutch/crawl/TextMD5Signature.java?rev=1619942&view=auto
==============================================================================
--- nutch/trunk/src/java/org/apache/nutch/crawl/TextMD5Signature.java (added)
+++ nutch/trunk/src/java/org/apache/nutch/crawl/TextMD5Signature.java Fri Aug 
22 22:23:27 2014
@@ -0,0 +1,42 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.nutch.crawl;
+
+import org.apache.hadoop.io.MD5Hash;
+import org.apache.nutch.parse.Parse;
+import org.apache.nutch.protocol.Content;
+
+/**
+ * Implementation of a page signature. It calculates an MD5 hash
+ * of the textual content of a page. In case there is no text (null or
+ * empty), it delegates to {@link MD5Signature} as a fallback.
+ */
+public class TextMD5Signature extends Signature {
+
+  Signature fallback = new MD5Signature();
+
+  public byte[] calculate(Content content, Parse parse) {
+    String text = parse.getText();
+
+    if (text == null || text.length() == 0) {
+      return fallback.calculate(content, parse);
+    }
+    
+    return MD5Hash.digest(text).getDigest();
+  }
+}

Propchange: nutch/trunk/src/java/org/apache/nutch/crawl/TextMD5Signature.java
------------------------------------------------------------------------------
    svn:eol-style = native


Reply via email to