Author: snagel Date: Fri Aug 22 22:23:27 2014 New Revision: 1619942 URL: http://svn.apache.org/r1619942 Log: NUTCH-1693 TextMD5Signature computed on textual content
Added: nutch/branches/2.x/src/java/org/apache/nutch/crawl/TextMD5Signature.java (with props) nutch/trunk/src/java/org/apache/nutch/crawl/TextMD5Signature.java (with props) Modified: nutch/branches/2.x/CHANGES.txt nutch/branches/2.x/src/java/org/apache/nutch/parse/ParseUtil.java nutch/trunk/CHANGES.txt Modified: nutch/branches/2.x/CHANGES.txt URL: http://svn.apache.org/viewvc/nutch/branches/2.x/CHANGES.txt?rev=1619942&r1=1619941&r2=1619942&view=diff ============================================================================== --- nutch/branches/2.x/CHANGES.txt (original) +++ nutch/branches/2.x/CHANGES.txt Fri Aug 22 22:23:27 2014 @@ -2,6 +2,8 @@ Nutch Change Log Current Development +* NUTCH-1693 TextMD5Signature computed on textual content (Tien Nguyen Manh, markus via snagel) + * NUTCH-1409 remove deprecated properties db.{default,max}.fetch.interval, generate.max.per.host.by.ip (Matthias Agethle via snagel) * NUTCH-1819 batchId in GeneratorJob ( Fjodor Vershinin via lewismc) Added: nutch/branches/2.x/src/java/org/apache/nutch/crawl/TextMD5Signature.java URL: http://svn.apache.org/viewvc/nutch/branches/2.x/src/java/org/apache/nutch/crawl/TextMD5Signature.java?rev=1619942&view=auto ============================================================================== --- nutch/branches/2.x/src/java/org/apache/nutch/crawl/TextMD5Signature.java (added) +++ nutch/branches/2.x/src/java/org/apache/nutch/crawl/TextMD5Signature.java Fri Aug 22 22:23:27 2014 @@ -0,0 +1,58 @@ +/** + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. 
You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.nutch.crawl; + +import java.util.Collection; +import java.util.HashSet; + +import org.apache.hadoop.io.MD5Hash; +import org.apache.nutch.storage.WebPage; + +/** + * Default implementation of a page signature. It calculates an MD5 hash of the + * textual content of a page. In case there is no text, it calculates a hash + * from the page's fetched content. + */ +public class TextMD5Signature extends Signature { + + private final static Collection<WebPage.Field> FIELDS = new HashSet<WebPage.Field>(); + + static { + FIELDS.add(WebPage.Field.TEXT); + } + + Signature fallback = new MD5Signature(); + + @Override + public byte[] calculate(WebPage page) { + CharSequence text = page.getText(); + + if (text == null || text.length() == 0) { + return fallback.calculate(page); + } + + return MD5Hash.digest(text.toString()).getDigest(); + } + + @Override + public Collection<WebPage.Field> getFields() { + Collection<WebPage.Field> fields = new HashSet<WebPage.Field>(FIELDS); + fields.addAll(fallback.getFields()); + return fields; + } +} Propchange: nutch/branches/2.x/src/java/org/apache/nutch/crawl/TextMD5Signature.java ------------------------------------------------------------------------------ svn:eol-style = native Modified: nutch/branches/2.x/src/java/org/apache/nutch/parse/ParseUtil.java URL: http://svn.apache.org/viewvc/nutch/branches/2.x/src/java/org/apache/nutch/parse/ParseUtil.java?rev=1619942&r1=1619941&r2=1619942&view=diff ============================================================================== --- 
nutch/branches/2.x/src/java/org/apache/nutch/parse/ParseUtil.java (original) +++ nutch/branches/2.x/src/java/org/apache/nutch/parse/ParseUtil.java Fri Aug 22 22:23:27 2014 @@ -187,8 +187,6 @@ public class ParseUtil extends Configure return; } - final byte[] signature = sig.calculate(page); - org.apache.nutch.storage.ParseStatus pstatus = parse.getParseStatus(); page.setParseStatus(pstatus); if (ParseStatusUtils.isSuccess(pstatus)) { @@ -233,6 +231,7 @@ public class ParseUtil extends Configure if (prevSig != null) { page.setPrevSignature(prevSig); } + final byte[] signature = sig.calculate(page); page.setSignature(ByteBuffer.wrap(signature)); if (page.getOutlinks() != null) { page.getOutlinks().clear(); Modified: nutch/trunk/CHANGES.txt URL: http://svn.apache.org/viewvc/nutch/trunk/CHANGES.txt?rev=1619942&r1=1619941&r2=1619942&view=diff ============================================================================== --- nutch/trunk/CHANGES.txt (original) +++ nutch/trunk/CHANGES.txt Fri Aug 22 22:23:27 2014 @@ -2,6 +2,8 @@ Nutch Change Log Nutch Current Development +* NUTCH-1693 TextMD5Signature computed on textual content (Tien Nguyen Manh, markus via snagel) + * NUTCH-1409 remove deprecated properties db.{default,max}.fetch.interval, generate.max.per.host.by.ip (Matthias Agethle via snagel) Nutch 1.9 Release Change Log - 12/08/2014 (dd/mm/yyyy) Added: nutch/trunk/src/java/org/apache/nutch/crawl/TextMD5Signature.java URL: http://svn.apache.org/viewvc/nutch/trunk/src/java/org/apache/nutch/crawl/TextMD5Signature.java?rev=1619942&view=auto ============================================================================== --- nutch/trunk/src/java/org/apache/nutch/crawl/TextMD5Signature.java (added) +++ nutch/trunk/src/java/org/apache/nutch/crawl/TextMD5Signature.java Fri Aug 22 22:23:27 2014 @@ -0,0 +1,42 @@ +/** + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. 
See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.nutch.crawl; + +import org.apache.hadoop.io.MD5Hash; +import org.apache.nutch.parse.Parse; +import org.apache.nutch.protocol.Content; + +/** + * Implementation of a page signature. It calculates an MD5 hash + * of the textual content of a page. In case there is no text, it + * falls back to the signature computed by {@link MD5Signature}. + */ +public class TextMD5Signature extends Signature { + + Signature fallback = new MD5Signature(); + + public byte[] calculate(Content content, Parse parse) { + String text = parse.getText(); + + if (text == null || text.length() == 0) { + return fallback.calculate(content, parse); + } + + return MD5Hash.digest(text).getDigest(); + } +} Propchange: nutch/trunk/src/java/org/apache/nutch/crawl/TextMD5Signature.java ------------------------------------------------------------------------------ svn:eol-style = native