Author: lewismc
Date: Thu Jun 27 17:04:42 2013
New Revision: 1497447
URL: http://svn.apache.org/r1497447
Log:
NUTCH-1591 Incorrect conversion of ByteBuffer to String
Modified:
nutch/branches/2.x/CHANGES.txt
nutch/branches/2.x/src/java/org/apache/nutch/api/DbReader.java
nutch/branches/2.x/src/java/org/apache/nutch/crawl/DbUpdateReducer.java
nutch/branches/2.x/src/java/org/apache/nutch/crawl/MD5Signature.java
nutch/branches/2.x/src/java/org/apache/nutch/crawl/SignatureComparator.java
nutch/branches/2.x/src/java/org/apache/nutch/crawl/WebTableReader.java
nutch/branches/2.x/src/java/org/apache/nutch/indexer/IndexUtil.java
nutch/branches/2.x/src/java/org/apache/nutch/parse/ParserChecker.java
nutch/branches/2.x/src/java/org/apache/nutch/storage/Host.java
nutch/branches/2.x/src/java/org/apache/nutch/util/Bytes.java
nutch/branches/2.x/src/java/org/apache/nutch/util/EncodingDetector.java
nutch/branches/2.x/src/java/org/apache/nutch/util/StringUtil.java
nutch/branches/2.x/src/plugin/creativecommons/src/java/org/creativecommons/nutch/CCIndexingFilter.java
nutch/branches/2.x/src/plugin/creativecommons/src/test/org/creativecommons/nutch/TestCCParseFilter.java
nutch/branches/2.x/src/plugin/index-basic/src/java/org/apache/nutch/indexer/basic/BasicIndexingFilter.java
nutch/branches/2.x/src/plugin/language-identifier/src/java/org/apache/nutch/analysis/lang/HTMLLanguageParser.java
nutch/branches/2.x/src/plugin/language-identifier/src/java/org/apache/nutch/analysis/lang/LanguageIndexingFilter.java
nutch/branches/2.x/src/plugin/language-identifier/src/test/org/apache/nutch/analysis/lang/TestHTMLLanguageParser.java
nutch/branches/2.x/src/plugin/microformats-reltag/src/java/org/apache/nutch/microformats/reltag/RelTagIndexingFilter.java
nutch/branches/2.x/src/plugin/parse-html/src/java/org/apache/nutch/parse/html/HtmlParser.java
nutch/branches/2.x/src/plugin/parse-js/src/java/org/apache/nutch/parse/js/JSParseFilter.java
nutch/branches/2.x/src/plugin/parse-tika/src/java/org/apache/nutch/parse/tika/TikaParser.java
nutch/branches/2.x/src/plugin/scoring-opic/src/java/org/apache/nutch/scoring/opic/OPICScoringFilter.java
nutch/branches/2.x/src/test/org/apache/nutch/crawl/TestInjector.java
nutch/branches/2.x/src/test/org/apache/nutch/fetcher/TestFetcher.java
Modified: nutch/branches/2.x/CHANGES.txt
URL:
http://svn.apache.org/viewvc/nutch/branches/2.x/CHANGES.txt?rev=1497447&r1=1497446&r2=1497447&view=diff
==============================================================================
--- nutch/branches/2.x/CHANGES.txt (original)
+++ nutch/branches/2.x/CHANGES.txt Thu Jun 27 17:04:42 2013
@@ -2,6 +2,8 @@ Nutch Change Log
Current Development
+* NUTCH-1591 Incorrect conversion of ByteBuffer to String (Jason Howes via lewismc)
+
* NUTCH-1571 SolrInputSplit doesn't implement Writable and crawl script
doesn't pass crawlId to generate and updatedb tasks (yuanyun.cn via lewismc)
* NUTCH-1126 JUnit test for urlfilter-prefix (Talat UYARER via markus)
Modified: nutch/branches/2.x/src/java/org/apache/nutch/api/DbReader.java
URL:
http://svn.apache.org/viewvc/nutch/branches/2.x/src/java/org/apache/nutch/api/DbReader.java?rev=1497447&r1=1497446&r2=1497447&view=diff
==============================================================================
--- nutch/branches/2.x/src/java/org/apache/nutch/api/DbReader.java (original)
+++ nutch/branches/2.x/src/java/org/apache/nutch/api/DbReader.java Thu Jun 27
17:04:42 2013
@@ -195,7 +195,7 @@ public class DbReader {
while (iterator.hasNext()) {
Entry<Utf8, ByteBuffer> entry = iterator.next();
simpleMeta.put(entry.getKey().toString(),
- Bytes.toStringBinary(entry.getValue().array()));
+ Bytes.toStringBinary(entry.getValue()));
}
}
res.put(f, simpleMeta);
@@ -207,10 +207,10 @@ public class DbReader {
res.put(f, ParseStatusUtils.toString(ps));
} else if ("signature".equals(f)) {
ByteBuffer bb = page.getSignature();
- res.put(f, StringUtil.toHexString(bb.array()));
+ res.put(f, StringUtil.toHexString(bb));
} else if ("content".equals(f)) {
ByteBuffer bb = page.getContent();
- res.put(f, Bytes.toStringBinary(bb.array()));
+ res.put(f, Bytes.toStringBinary(bb));
} else if ("markers".equals(f)) {
res.put(f, convertMap(page.getMarkers()));
} else if ("inlinks".equals(f)) {
@@ -221,7 +221,7 @@ public class DbReader {
if (val instanceof Utf8) {
val = val.toString();
} else if (val instanceof ByteBuffer) {
- val = Bytes.toStringBinary(((ByteBuffer)val).array());
+ val = Bytes.toStringBinary((ByteBuffer)val);
}
res.put(f, val);
}
Modified:
nutch/branches/2.x/src/java/org/apache/nutch/crawl/DbUpdateReducer.java
URL:
http://svn.apache.org/viewvc/nutch/branches/2.x/src/java/org/apache/nutch/crawl/DbUpdateReducer.java?rev=1497447&r1=1497446&r2=1497447&view=diff
==============================================================================
--- nutch/branches/2.x/src/java/org/apache/nutch/crawl/DbUpdateReducer.java
(original)
+++ nutch/branches/2.x/src/java/org/apache/nutch/crawl/DbUpdateReducer.java Thu
Jun 27 17:04:42 2013
@@ -118,7 +118,7 @@ extends GoraReducer<UrlWithScore, NutchW
ByteBuffer prevSig = page.getPrevSignature();
ByteBuffer signature = page.getSignature();
if (prevSig != null && signature != null) {
- if (SignatureComparator.compare(prevSig.array(), signature.array()) != 0) {
+ if (SignatureComparator.compare(prevSig, signature) != 0) {
modified = FetchSchedule.STATUS_MODIFIED;
} else {
modified = FetchSchedule.STATUS_NOTMODIFIED;
Modified: nutch/branches/2.x/src/java/org/apache/nutch/crawl/MD5Signature.java
URL:
http://svn.apache.org/viewvc/nutch/branches/2.x/src/java/org/apache/nutch/crawl/MD5Signature.java?rev=1497447&r1=1497446&r2=1497447&view=diff
==============================================================================
--- nutch/branches/2.x/src/java/org/apache/nutch/crawl/MD5Signature.java
(original)
+++ nutch/branches/2.x/src/java/org/apache/nutch/crawl/MD5Signature.java Thu
Jun 27 17:04:42 2013
@@ -17,9 +17,11 @@
package org.apache.nutch.crawl;
+import java.nio.ByteBuffer;
import java.util.Collection;
import java.util.HashSet;
+import org.apache.avro.util.Utf8;
import org.apache.hadoop.io.MD5Hash;
import org.apache.nutch.storage.WebPage;
@@ -40,9 +42,29 @@ public class MD5Signature extends Signat
@Override
public byte[] calculate(WebPage page) {
- byte[] data = page.getContent().array();
- if (data == null && page.getBaseUrl()!=null) data = page.getBaseUrl().getBytes();
- return MD5Hash.digest(data).getDigest();
+ ByteBuffer buf = page.getContent();
+ byte[] data;
+ int of;
+ int cb;
+ if (buf == null) {
+ Utf8 baseUrl = page.getBaseUrl();
+ if (baseUrl == null) {
+ data = null;
+ of = 0;
+ cb = 0;
+ }
+ else {
+ data = baseUrl.getBytes();
+ of = 0;
+ cb = baseUrl.getLength();
+ }
+ } else {
+ data = buf.array();
+ of = buf.arrayOffset() + buf.position();
+ cb = buf.remaining();
+ }
+
+ return MD5Hash.digest(data, of, cb).getDigest();
}
@Override
Modified:
nutch/branches/2.x/src/java/org/apache/nutch/crawl/SignatureComparator.java
URL:
http://svn.apache.org/viewvc/nutch/branches/2.x/src/java/org/apache/nutch/crawl/SignatureComparator.java?rev=1497447&r1=1497446&r2=1497447&view=diff
==============================================================================
--- nutch/branches/2.x/src/java/org/apache/nutch/crawl/SignatureComparator.java
(original)
+++ nutch/branches/2.x/src/java/org/apache/nutch/crawl/SignatureComparator.java
Thu Jun 27 17:04:42 2013
@@ -17,13 +17,23 @@
package org.apache.nutch.crawl;
+import java.nio.ByteBuffer;
+
public class SignatureComparator {
public static int compare(byte[] data1, byte[] data2) {
if (data1 == null && data2 == null) return 0;
if (data1 == null) return -1;
if (data2 == null) return 1;
- return _compare(data1, 0, data1.length, data2, 0, data2.length); }
-
+ return _compare(data1, 0, data1.length, data2, 0, data2.length);
+ }
+
+ public static int compare(ByteBuffer buf1, ByteBuffer buf2) {
+ if (buf1 == null && buf2 == null) return 0;
+ if (buf1 == null) return -1;
+ if (buf2 == null) return 1;
+ return _compare(buf1.array(), buf1.arrayOffset() + buf1.position(), buf1.remaining(),
+ buf2.array(), buf2.arrayOffset() + buf2.position(), buf2.remaining());
+ }
public static int _compare(byte[] data1, int s1, int l1, byte[] data2, int s2, int l2) {
if (l2 > l1) return -1;
Modified: nutch/branches/2.x/src/java/org/apache/nutch/crawl/WebTableReader.java
URL:
http://svn.apache.org/viewvc/nutch/branches/2.x/src/java/org/apache/nutch/crawl/WebTableReader.java?rev=1497447&r1=1497446&r2=1497447&view=diff
==============================================================================
--- nutch/branches/2.x/src/java/org/apache/nutch/crawl/WebTableReader.java
(original)
+++ nutch/branches/2.x/src/java/org/apache/nutch/crawl/WebTableReader.java Thu
Jun 27 17:04:42 2013
@@ -355,11 +355,11 @@ public class WebTableReader extends Nutc
ProtocolStatusUtils.toString(page.getProtocolStatus())).append("\n");
ByteBuffer prevSig = page.getPrevSignature();
if (prevSig != null) {
- sb.append("prevSignature:\t" + StringUtil.toHexString(prevSig.array())).append("\n");
+ sb.append("prevSignature:\t" + StringUtil.toHexString(prevSig)).append("\n");
}
ByteBuffer sig = page.getSignature();
if (sig != null) {
- sb.append("signature:\t" + StringUtil.toHexString(sig.array())).append("\n");
+ sb.append("signature:\t" + StringUtil.toHexString(sig)).append("\n");
}
sb.append("parseStatus:\t" +
ParseStatusUtils.toString(page.getParseStatus())).append("\n");
@@ -380,7 +380,7 @@ public class WebTableReader extends Nutc
while (iterator.hasNext()) {
Entry<Utf8, ByteBuffer> entry = iterator.next();
sb.append("metadata " + entry.getKey().toString()).append(" : \t")
- .append(Bytes.toString(entry.getValue().array())).append("\n");
+ .append(Bytes.toString(entry.getValue())).append("\n");
}
}
if (dumpLinks) {
@@ -409,7 +409,7 @@ public class WebTableReader extends Nutc
if (content != null && dumpContent) {
sb.append("contentType:\t" + page.getContentType()).append("\n");
sb.append("content:start:\n");
- sb.append(Bytes.toString(content.array()));
+ sb.append(Bytes.toString(content));
sb.append("\ncontent:end:\n");
}
Utf8 text = page.getText();
Modified: nutch/branches/2.x/src/java/org/apache/nutch/indexer/IndexUtil.java
URL:
http://svn.apache.org/viewvc/nutch/branches/2.x/src/java/org/apache/nutch/indexer/IndexUtil.java?rev=1497447&r1=1497446&r2=1497447&view=diff
==============================================================================
--- nutch/branches/2.x/src/java/org/apache/nutch/indexer/IndexUtil.java
(original)
+++ nutch/branches/2.x/src/java/org/apache/nutch/indexer/IndexUtil.java Thu Jun
27 17:04:42 2013
@@ -62,7 +62,7 @@ public class IndexUtil {
public NutchDocument index(String key, WebPage page) {
NutchDocument doc = new NutchDocument();
doc.add("id", key);
- doc.add("digest", StringUtil.toHexString(page.getSignature().array()));
+ doc.add("digest", StringUtil.toHexString(page.getSignature()));
if (page.getBatchId() != null) {
doc.add("batchId", page.getBatchId().toString());
}
Modified: nutch/branches/2.x/src/java/org/apache/nutch/parse/ParserChecker.java
URL:
http://svn.apache.org/viewvc/nutch/branches/2.x/src/java/org/apache/nutch/parse/ParserChecker.java?rev=1497447&r1=1497446&r2=1497447&view=diff
==============================================================================
--- nutch/branches/2.x/src/java/org/apache/nutch/parse/ParserChecker.java
(original)
+++ nutch/branches/2.x/src/java/org/apache/nutch/parse/ParserChecker.java Thu
Jun 27 17:04:42 2013
@@ -171,7 +171,7 @@ public class ParserChecker implements To
while (iterator.hasNext()) {
Entry<Utf8, ByteBuffer> entry = iterator.next();
sb.append(entry.getKey().toString()).append(" : \t")
- .append(Bytes.toString(entry.getValue().array())).append("\n");
+ .append(Bytes.toString(entry.getValue())).append("\n");
}
System.out.print(sb.toString());
}
Modified: nutch/branches/2.x/src/java/org/apache/nutch/storage/Host.java
URL:
http://svn.apache.org/viewvc/nutch/branches/2.x/src/java/org/apache/nutch/storage/Host.java?rev=1497447&r1=1497446&r2=1497447&view=diff
==============================================================================
--- nutch/branches/2.x/src/java/org/apache/nutch/storage/Host.java (original)
+++ nutch/branches/2.x/src/java/org/apache/nutch/storage/Host.java Thu Jun 27
17:04:42 2013
@@ -18,23 +18,14 @@ package org.apache.nutch.storage;
import java.nio.ByteBuffer;
import java.util.Map;
-import java.util.HashMap;
-import org.apache.avro.Protocol;
import org.apache.avro.Schema;
import org.apache.avro.AvroRuntimeException;
-import org.apache.avro.Protocol;
import org.apache.avro.util.Utf8;
-import org.apache.avro.ipc.AvroRemoteException;
-import org.apache.avro.generic.GenericArray;
-import org.apache.avro.specific.SpecificExceptionBase;
-import org.apache.avro.specific.SpecificRecordBase;
-import org.apache.avro.specific.SpecificRecord;
-import org.apache.avro.specific.SpecificFixed;
import org.apache.gora.persistency.StateManager;
import org.apache.gora.persistency.impl.PersistentBase;
import org.apache.gora.persistency.impl.StateManagerImpl;
import org.apache.gora.persistency.StatefulHashMap;
-import org.apache.gora.persistency.ListGenericArray;
+import org.apache.nutch.util.Bytes;
@SuppressWarnings("all")
public class Host extends PersistentBase {
@@ -152,7 +143,7 @@ public class Host extends PersistentBase
public String getValue(String key, String defaultValue) {
if (!contains(key)) return defaultValue;
- return new String(metadata.get(new Utf8(key)).array());
+ return Bytes.toString(metadata.get(new Utf8(key)));
}
public int getInt(String key, int defaultValue) {
Modified: nutch/branches/2.x/src/java/org/apache/nutch/util/Bytes.java
URL:
http://svn.apache.org/viewvc/nutch/branches/2.x/src/java/org/apache/nutch/util/Bytes.java?rev=1497447&r1=1497446&r2=1497447&view=diff
==============================================================================
--- nutch/branches/2.x/src/java/org/apache/nutch/util/Bytes.java (original)
+++ nutch/branches/2.x/src/java/org/apache/nutch/util/Bytes.java Thu Jun 27
17:04:42 2013
@@ -273,12 +273,27 @@ public class Bytes {
* @return the byte array
*/
public static byte[] toBytes(ByteBuffer bb) {
- int length = bb.limit();
+ int length = bb.remaining();
byte[] result = new byte[length];
- System.arraycopy(bb.array(), bb.arrayOffset(), result, 0, length);
+ System.arraycopy(bb.array(), bb.arrayOffset() + bb.position(), result, 0, length);
return result;
}
+ /**
+ * This method will convert utf8 encoded bytes into a string. If an
+ * UnsupportedEncodingException occurs, this method will eat it and return
+ * null instead.
+ *
+ * @param bb
+ * Presumed UTF-8 encoded ByteBuffer.
+ * @return String made from <code>b</code> or null
+ */
+ public static String toString(ByteBuffer bb) {
+ return bb == null
+ ? null
+ : toString(bb.array(), bb.arrayOffset() + bb.position(), bb.remaining());
+ }
+
/**
* @param b
* Presumed UTF-8 encoded byte array.
@@ -333,6 +348,20 @@ public class Bytes {
}
}
+ /**
+ * Write a printable representation of a ByteBuffer. Non-printable
+ * characters are hex escaped in the format \\x%02X, eg: \x00 \x05 etc
+ *
+ * @param bb
+ * ByteBuffer to write out
+ * @return string output
+ */
+ public static String toStringBinary(ByteBuffer bb) {
+ return bb == null
+ ? null
+ : toStringBinary(bb.array(), bb.arrayOffset() + bb.position(), bb.remaining());
+ }
+
/**
* Write a printable representation of a byte array.
*
Modified:
nutch/branches/2.x/src/java/org/apache/nutch/util/EncodingDetector.java
URL:
http://svn.apache.org/viewvc/nutch/branches/2.x/src/java/org/apache/nutch/util/EncodingDetector.java?rev=1497447&r1=1497446&r2=1497447&view=diff
==============================================================================
--- nutch/branches/2.x/src/java/org/apache/nutch/util/EncodingDetector.java
(original)
+++ nutch/branches/2.x/src/java/org/apache/nutch/util/EncodingDetector.java Thu
Jun 27 17:04:42 2013
@@ -16,6 +16,7 @@
*/
package org.apache.nutch.util;
+import java.io.ByteArrayInputStream;
import java.nio.ByteBuffer;
import java.nio.charset.Charset;
import java.util.ArrayList;
@@ -169,21 +170,20 @@ public class EncodingDetector {
private void autoDetectClues(ByteBuffer dataBuffer, Utf8 typeUtf8,
String encoding, boolean filter) {
- byte[] data = dataBuffer.array();
+ int length = dataBuffer.remaining();
String type = TableUtil.toString(typeUtf8);
if (minConfidence >= 0 && DETECTABLES.contains(type)
- && data.length > MIN_LENGTH) {
+ && length > MIN_LENGTH) {
CharsetMatch[] matches = null;
// do all these in a try/catch; setText and detect/detectAll
// will sometimes throw exceptions
try {
detector.enableInputFilter(filter);
- if (data.length > MIN_LENGTH) {
- detector.setText(data);
- matches = detector.detectAll();
- }
+ detector.setText(new ByteArrayInputStream(dataBuffer.array(),
+ dataBuffer.arrayOffset() + dataBuffer.position(), length));
+ matches = detector.detectAll();
} catch (Exception e) {
LOG.debug("Exception from ICU4J (ignoring): ", e);
}
Modified: nutch/branches/2.x/src/java/org/apache/nutch/util/StringUtil.java
URL:
http://svn.apache.org/viewvc/nutch/branches/2.x/src/java/org/apache/nutch/util/StringUtil.java?rev=1497447&r1=1497446&r2=1497447&view=diff
==============================================================================
--- nutch/branches/2.x/src/java/org/apache/nutch/util/StringUtil.java (original)
+++ nutch/branches/2.x/src/java/org/apache/nutch/util/StringUtil.java Thu Jun
27 17:04:42 2013
@@ -17,6 +17,8 @@
package org.apache.nutch.util;
+import java.nio.ByteBuffer;
+
/**
* A collection of String processing utility methods.
*/
@@ -52,6 +54,28 @@ public class StringUtil {
{'0','1','2','3','4','5','6','7','8','9','a','b','c','d','e','f'};
/**
+ * Convenience call for {@link #toHexString(ByteBuffer, String, int)}, where
+ * <code>sep = null; lineLen = Integer.MAX_VALUE</code>.
+ * @param buf
+ */
+ public static String toHexString(ByteBuffer buf) {
+ return toHexString(buf, null, Integer.MAX_VALUE);
+ }
+
+ /**
+ * Get a text representation of a ByteBuffer as hexadecimal String, where
each
+ * pair of hexadecimal digits corresponds to consecutive bytes in the array.
+ * @param buf input data
+ * @param sep separate every pair of hexadecimal digits with this separator,
or
+ * null if no separation is needed.
+ * @param lineLen break the output String into lines containing output for
lineLen
+ * bytes.
+ */
+ public static String toHexString(ByteBuffer buf, String sep, int lineLen) {
+ return toHexString(buf.array(), buf.arrayOffset() + buf.position(), buf.remaining(), sep, lineLen);
+ }
+
+ /**
* Convenience call for {@link #toHexString(byte[], String, int)}, where
* <code>sep = null; lineLen = Integer.MAX_VALUE</code>.
* @param buf
@@ -70,15 +94,30 @@ public class StringUtil {
* bytes.
*/
public static String toHexString(byte[] buf, String sep, int lineLen) {
+ return toHexString(buf, 0, buf.length, sep, lineLen);
+ }
+
+ /**
+ * Get a text representation of a byte[] as hexadecimal String, where each
+ * pair of hexadecimal digits corresponds to consecutive bytes in the array.
+ * @param buf input data
+ * @param of the offset into the byte[] to start reading
+ * @param cb the number of bytes to read from the byte[]
+ * @param sep separate every pair of hexadecimal digits with this separator,
or
+ * null if no separation is needed.
+ * @param lineLen break the output String into lines containing output for
lineLen
+ * bytes.
+ */
+ public static String toHexString(byte[] buf, int of, int cb, String sep, int lineLen) {
if (buf == null) return null;
if (lineLen <= 0) lineLen = Integer.MAX_VALUE;
- StringBuffer res = new StringBuffer(buf.length * 2);
- for (int i = 0; i < buf.length; i++) {
- int b = buf[i];
+ StringBuffer res = new StringBuffer(cb * 2);
+ for (int c = 0; c < cb; c++) {
+ int b = buf[of++];
res.append(HEX_DIGITS[(b >> 4) & 0xf]);
res.append(HEX_DIGITS[b & 0xf]);
- if (i > 0 && (i % lineLen) == 0) res.append('\n');
- else if (sep != null && i < lineLen - 1) res.append(sep);
+ if (c > 0 && (c % lineLen) == 0) res.append('\n');
+ else if (sep != null && c < lineLen - 1) res.append(sep);
}
return res.toString();
}
Modified:
nutch/branches/2.x/src/plugin/creativecommons/src/java/org/creativecommons/nutch/CCIndexingFilter.java
URL:
http://svn.apache.org/viewvc/nutch/branches/2.x/src/plugin/creativecommons/src/java/org/creativecommons/nutch/CCIndexingFilter.java?rev=1497447&r1=1497446&r2=1497447&view=diff
==============================================================================
---
nutch/branches/2.x/src/plugin/creativecommons/src/java/org/creativecommons/nutch/CCIndexingFilter.java
(original)
+++
nutch/branches/2.x/src/plugin/creativecommons/src/java/org/creativecommons/nutch/CCIndexingFilter.java
Thu Jun 27 17:04:42 2013
@@ -34,6 +34,7 @@ import org.apache.nutch.indexer.NutchDoc
import org.apache.nutch.metadata.CreativeCommons;
import org.apache.nutch.storage.WebPage;
import org.apache.nutch.storage.WebPage.Field;
+import org.apache.nutch.util.Bytes;
/** Adds basic searchable fields to a document. */
public class CCIndexingFilter implements IndexingFilter {
@@ -102,7 +103,7 @@ public class CCIndexingFilter implements
ByteBuffer blicense = page.getFromMetadata(new Utf8(
CreativeCommons.LICENSE_URL));
if (blicense != null) {
- String licenseUrl = new String(blicense.array());
+ String licenseUrl = Bytes.toString(blicense);
if (LOG.isInfoEnabled()) {
LOG.info("CC: indexing " + licenseUrl + " for: "
+ url.toString());
@@ -119,7 +120,7 @@ public class CCIndexingFilter implements
ByteBuffer blicenseloc = page.getFromMetadata(new Utf8(
CreativeCommons.LICENSE_LOCATION));
if (blicenseloc != null) {
- String licenseLocation = new String(blicenseloc.array());
+ String licenseLocation = Bytes.toString(blicenseloc);
addFeature(doc, "meta=" + licenseLocation);
}
@@ -127,7 +128,7 @@ public class CCIndexingFilter implements
ByteBuffer bworkType = page.getFromMetadata(new Utf8(
CreativeCommons.WORK_TYPE));
if (bworkType != null) {
- String workType = new String(bworkType.array());
+ String workType = Bytes.toString(bworkType);
addFeature(doc, workType);
}
Modified:
nutch/branches/2.x/src/plugin/creativecommons/src/test/org/creativecommons/nutch/TestCCParseFilter.java
URL:
http://svn.apache.org/viewvc/nutch/branches/2.x/src/plugin/creativecommons/src/test/org/creativecommons/nutch/TestCCParseFilter.java?rev=1497447&r1=1497446&r2=1497447&view=diff
==============================================================================
---
nutch/branches/2.x/src/plugin/creativecommons/src/test/org/creativecommons/nutch/TestCCParseFilter.java
(original)
+++
nutch/branches/2.x/src/plugin/creativecommons/src/test/org/creativecommons/nutch/TestCCParseFilter.java
Thu Jun 27 17:04:42 2013
@@ -21,6 +21,7 @@ import org.apache.nutch.parse.ParseUtil;
import org.apache.avro.util.Utf8;
import org.apache.hadoop.conf.Configuration;
import org.apache.nutch.storage.WebPage;
+import org.apache.nutch.util.Bytes;
import org.apache.nutch.util.MimeUtil;
import org.apache.nutch.util.NutchConfiguration;
import java.io.*;
@@ -73,13 +74,10 @@ public class TestCCParseFilter {
new ParseUtil(conf).parse(url, page);
ByteBuffer bb = page.getFromMetadata(new Utf8("License-Url"));
- assertEquals(license, new String(bb.array()));
+ assertEquals(license, Bytes.toString(bb));
bb = page.getFromMetadata(new Utf8("License-Location"));
- assertEquals(location, new String(bb.array()));
+ assertEquals(location, Bytes.toString(bb));
bb = page.getFromMetadata(new Utf8("Work-Type"));
- if (bb == null)
- assertEquals(type, null);
- else
- assertEquals(type, new String(bb.array()));
+ assertEquals(type, Bytes.toString(bb));
}
}
Modified:
nutch/branches/2.x/src/plugin/index-basic/src/java/org/apache/nutch/indexer/basic/BasicIndexingFilter.java
URL:
http://svn.apache.org/viewvc/nutch/branches/2.x/src/plugin/index-basic/src/java/org/apache/nutch/indexer/basic/BasicIndexingFilter.java?rev=1497447&r1=1497446&r2=1497447&view=diff
==============================================================================
---
nutch/branches/2.x/src/plugin/index-basic/src/java/org/apache/nutch/indexer/basic/BasicIndexingFilter.java
(original)
+++
nutch/branches/2.x/src/plugin/index-basic/src/java/org/apache/nutch/indexer/basic/BasicIndexingFilter.java
Thu Jun 27 17:04:42 2013
@@ -119,8 +119,7 @@ public class BasicIndexingFilter impleme
// add cached content/summary display policy, if available
ByteBuffer cachingRaw = page
.getFromMetadata(Nutch.CACHING_FORBIDDEN_KEY_UTF8);
- String caching = (cachingRaw == null ? null : Bytes.toString(cachingRaw
- .array()));
+ String caching = Bytes.toString(cachingRaw);
if (caching != null && !caching.equals(Nutch.CACHING_FORBIDDEN_NONE)) {
doc.add("cache", caching);
}
Modified:
nutch/branches/2.x/src/plugin/language-identifier/src/java/org/apache/nutch/analysis/lang/HTMLLanguageParser.java
URL:
http://svn.apache.org/viewvc/nutch/branches/2.x/src/plugin/language-identifier/src/java/org/apache/nutch/analysis/lang/HTMLLanguageParser.java?rev=1497447&r1=1497446&r2=1497447&view=diff
==============================================================================
---
nutch/branches/2.x/src/plugin/language-identifier/src/java/org/apache/nutch/analysis/lang/HTMLLanguageParser.java
(original)
+++
nutch/branches/2.x/src/plugin/language-identifier/src/java/org/apache/nutch/analysis/lang/HTMLLanguageParser.java
Thu Jun 27 17:04:42 2013
@@ -132,7 +132,7 @@ public class HTMLLanguageParser implemen
LanguageParser parser = new LanguageParser(doc);
lang = parser.getLanguage();
} else
- lang = Bytes.toString(blang.array());
+ lang = Bytes.toString(blang);
if (lang != null) {
return lang;
Modified:
nutch/branches/2.x/src/plugin/language-identifier/src/java/org/apache/nutch/analysis/lang/LanguageIndexingFilter.java
URL:
http://svn.apache.org/viewvc/nutch/branches/2.x/src/plugin/language-identifier/src/java/org/apache/nutch/analysis/lang/LanguageIndexingFilter.java?rev=1497447&r1=1497446&r2=1497447&view=diff
==============================================================================
---
nutch/branches/2.x/src/plugin/language-identifier/src/java/org/apache/nutch/analysis/lang/LanguageIndexingFilter.java
(original)
+++
nutch/branches/2.x/src/plugin/language-identifier/src/java/org/apache/nutch/analysis/lang/LanguageIndexingFilter.java
Thu Jun 27 17:04:42 2013
@@ -61,11 +61,8 @@ public class LanguageIndexingFilter impl
throws IndexingException {
// check if LANGUAGE found, possibly put there by HTMLLanguageParser
- String lang = null;
ByteBuffer blang = page.getFromMetadata(new Utf8(Metadata.LANGUAGE));
- if (blang != null) {
- lang = Bytes.toString(blang.array());
- }
+ String lang = Bytes.toString(blang);
if (lang == null || lang.length() == 0) {
lang = "unknown";
Modified:
nutch/branches/2.x/src/plugin/language-identifier/src/test/org/apache/nutch/analysis/lang/TestHTMLLanguageParser.java
URL:
http://svn.apache.org/viewvc/nutch/branches/2.x/src/plugin/language-identifier/src/test/org/apache/nutch/analysis/lang/TestHTMLLanguageParser.java?rev=1497447&r1=1497446&r2=1497447&view=diff
==============================================================================
---
nutch/branches/2.x/src/plugin/language-identifier/src/test/org/apache/nutch/analysis/lang/TestHTMLLanguageParser.java
(original)
+++
nutch/branches/2.x/src/plugin/language-identifier/src/test/org/apache/nutch/analysis/lang/TestHTMLLanguageParser.java
Thu Jun 27 17:04:42 2013
@@ -60,9 +60,7 @@ public class TestHTMLLanguageParser {
WebPage page = getPage(docs[t]);
parser.parse(URL.toString(), page);
ByteBuffer blang = page.getFromMetadata(new Utf8(Metadata.LANGUAGE));
- String lang = null;
- if (blang != null)
- lang = Bytes.toString(blang.array());
+ String lang = Bytes.toString(blang);
assertEquals(metalanguages[t], lang);
}
} catch (Exception e) {
Modified:
nutch/branches/2.x/src/plugin/microformats-reltag/src/java/org/apache/nutch/microformats/reltag/RelTagIndexingFilter.java
URL:
http://svn.apache.org/viewvc/nutch/branches/2.x/src/plugin/microformats-reltag/src/java/org/apache/nutch/microformats/reltag/RelTagIndexingFilter.java?rev=1497447&r1=1497446&r2=1497447&view=diff
==============================================================================
---
nutch/branches/2.x/src/plugin/microformats-reltag/src/java/org/apache/nutch/microformats/reltag/RelTagIndexingFilter.java
(original)
+++
nutch/branches/2.x/src/plugin/microformats-reltag/src/java/org/apache/nutch/microformats/reltag/RelTagIndexingFilter.java
Thu Jun 27 17:04:42 2013
@@ -28,6 +28,7 @@ import org.apache.nutch.indexer.Indexing
import org.apache.nutch.indexer.NutchDocument;
import org.apache.nutch.storage.WebPage;
import org.apache.nutch.storage.WebPage.Field;
+import org.apache.nutch.util.Bytes;
/**
* An {@link org.apache.nutch.indexer.IndexingFilter} that adds
<code>tag</code>
@@ -87,7 +88,7 @@ public class RelTagIndexingFilter implem
ByteBuffer bb = page.getFromMetadata(new Utf8(RelTagParser.REL_TAG));
if (bb != null) {
- String[] tags = new String(bb.array()).split("\t");
+ String[] tags = Bytes.toString(bb).split("\t");
for (int i = 0; i < tags.length; i++) {
doc.add("tag", tags[i]);
}
Modified:
nutch/branches/2.x/src/plugin/parse-html/src/java/org/apache/nutch/parse/html/HtmlParser.java
URL:
http://svn.apache.org/viewvc/nutch/branches/2.x/src/plugin/parse-html/src/java/org/apache/nutch/parse/html/HtmlParser.java?rev=1497447&r1=1497446&r2=1497447&view=diff
==============================================================================
---
nutch/branches/2.x/src/plugin/parse-html/src/java/org/apache/nutch/parse/html/HtmlParser.java
(original)
+++
nutch/branches/2.x/src/plugin/parse-html/src/java/org/apache/nutch/parse/html/HtmlParser.java
Thu Jun 27 17:04:42 2013
@@ -85,7 +85,7 @@ public class HtmlParser implements Parse
private String parserImpl;
/**
- * Given a <code>byte[]</code> representing an html file of an
+ * Given a <code>ByteBuffer</code> representing an html file of an
* <em>unknown</em> encoding, read out 'charset' parameter in the meta tag
* from the first <code>CHUNK_SIZE</code> bytes.
* If there's no meta tag for Content-Type or no charset is specified,
@@ -97,12 +97,11 @@ public class HtmlParser implements Parse
* See also http://www.w3.org/TR/REC-xml/#sec-guessing
* <br />
*
- * @param content <code>byte[]</code> representation of an html file
+ * @param content <code>ByteBuffer</code> representation of an html file
*/
- private static String sniffCharacterEncoding(byte[] content) {
- int length = content.length < CHUNK_SIZE ?
- content.length : CHUNK_SIZE;
+ private static String sniffCharacterEncoding(ByteBuffer content) {
+ int length = Math.min(content.remaining(), CHUNK_SIZE);
// We don't care about non-ASCII parts so that it's sufficient
// to just inflate each byte to a 16-bit value by padding.
@@ -110,8 +109,8 @@ public class HtmlParser implements Parse
// {U+0041, U+0082, U+00B7}.
String str = "";
try {
- str = new String(content, 0, length,
- Charset.forName("ASCII").toString());
+ str = new String(content.array(), content.arrayOffset() + content.position(),
+ length, Charset.forName("ASCII").toString());
} catch (UnsupportedEncodingException e) {
// code should never come here, but just in case...
return null;
@@ -157,8 +156,9 @@ public class HtmlParser implements Parse
// parse the content
DocumentFragment root;
try {
- byte[] contentInOctets = page.getContent().array();
- InputSource input = new InputSource(new ByteArrayInputStream(contentInOctets));
+ ByteBuffer contentInOctets = page.getContent();
+ InputSource input = new InputSource(new ByteArrayInputStream(contentInOctets.array(),
+ contentInOctets.arrayOffset() + contentInOctets.position(), contentInOctets.remaining()));
EncodingDetector detector = new EncodingDetector(conf);
detector.autoDetectClues(page, true);
Modified:
nutch/branches/2.x/src/plugin/parse-js/src/java/org/apache/nutch/parse/js/JSParseFilter.java
URL:
http://svn.apache.org/viewvc/nutch/branches/2.x/src/plugin/parse-js/src/java/org/apache/nutch/parse/js/JSParseFilter.java?rev=1497447&r1=1497446&r2=1497447&view=diff
==============================================================================
---
nutch/branches/2.x/src/plugin/parse-js/src/java/org/apache/nutch/parse/js/JSParseFilter.java
(original)
+++
nutch/branches/2.x/src/plugin/parse-js/src/java/org/apache/nutch/parse/js/JSParseFilter.java
Thu Jun 27 17:04:42 2013
@@ -39,6 +39,7 @@ import org.apache.nutch.parse.ParseStatu
import org.apache.nutch.parse.Parser;
import org.apache.nutch.storage.ParseStatus;
import org.apache.nutch.storage.WebPage;
+import org.apache.nutch.util.Bytes;
import org.apache.nutch.util.NutchConfiguration;
import org.apache.nutch.util.TableUtil;
import org.apache.oro.text.regex.MatchResult;
@@ -163,7 +164,7 @@ public class JSParseFilter implements Pa
if (type != null && !type.trim().equals("") &&
!type.toLowerCase().startsWith("application/x-javascript"))
return ParseStatusUtils.getEmptyParse(ParseStatusCodes.FAILED_INVALID_FORMAT,
"Content not JavaScript: '" + type + "'", getConf());
- String script = new String(page.getContent().array());
+ String script = Bytes.toString(page.getContent());
Outlink[] outlinks = getJSLinks(script, "", url);
if (outlinks == null) outlinks = new Outlink[0];
// Title? use the first line of the script...
Modified: nutch/branches/2.x/src/plugin/parse-tika/src/java/org/apache/nutch/parse/tika/TikaParser.java
URL: http://svn.apache.org/viewvc/nutch/branches/2.x/src/plugin/parse-tika/src/java/org/apache/nutch/parse/tika/TikaParser.java?rev=1497447&r1=1497446&r2=1497447&view=diff
==============================================================================
--- nutch/branches/2.x/src/plugin/parse-tika/src/java/org/apache/nutch/parse/tika/TikaParser.java (original)
+++ nutch/branches/2.x/src/plugin/parse-tika/src/java/org/apache/nutch/parse/tika/TikaParser.java Thu Jun 27 17:04:42 2013
@@ -51,7 +51,6 @@ import org.apache.nutch.util.NutchConfig
import org.apache.nutch.util.TableUtil;
import org.apache.tika.metadata.Metadata;
import org.apache.tika.metadata.TikaCoreProperties;
-import org.apache.tika.mime.MimeType;
import org.apache.tika.parser.ParseContext;
import org.apache.tika.parser.Parser;
import org.w3c.dom.DocumentFragment;
@@ -92,7 +91,7 @@ public class TikaParser implements org.a
// get the right parser using the mime type as a clue
String mimeType = page.getContentType().toString();
Parser parser = tikaConfig.getParser(mimeType);
- byte[] raw = page.getContent().array();
+ ByteBuffer raw = page.getContent();
if (parser == null) {
String message = "Can't retrieve Tika parser for mime-type " + mimeType;
@@ -114,7 +113,8 @@ public class TikaParser implements org.a
// to add once available in Tika
// context.set(HtmlMapper.class, IdentityHtmlMapper.INSTANCE);
try {
- parser.parse(new ByteArrayInputStream(raw), domhandler, tikamd, context);
+ parser.parse(new ByteArrayInputStream(raw.array(), raw.arrayOffset() + raw.position(),
+ raw.remaining()), domhandler, tikamd, context);
} catch (Exception e) {
LOG.error("Error parsing "+url,e);
return ParseStatusUtils.getEmptyParse(e, getConf());
Modified: nutch/branches/2.x/src/plugin/scoring-opic/src/java/org/apache/nutch/scoring/opic/OPICScoringFilter.java
URL: http://svn.apache.org/viewvc/nutch/branches/2.x/src/plugin/scoring-opic/src/java/org/apache/nutch/scoring/opic/OPICScoringFilter.java?rev=1497447&r1=1497446&r2=1497447&view=diff
==============================================================================
--- nutch/branches/2.x/src/plugin/scoring-opic/src/java/org/apache/nutch/scoring/opic/OPICScoringFilter.java (original)
+++ nutch/branches/2.x/src/plugin/scoring-opic/src/java/org/apache/nutch/scoring/opic/OPICScoringFilter.java Thu Jun 27 17:04:42 2013
@@ -111,7 +111,7 @@ public class OPICScoringFilter implement
ByteBuffer cashRaw = row.getFromMetadata(CASH_KEY);
float cash = 0.0f;
if (cashRaw != null) {
- cash = Bytes.toFloat(cashRaw.array());
+ cash = Bytes.toFloat(cashRaw.array(), cashRaw.arrayOffset() + cashRaw.position());
}
row.putToMetadata(CASH_KEY, ByteBuffer.wrap(Bytes.toBytes(cash + adjust)));
}
@@ -125,7 +125,7 @@ public class OPICScoringFilter implement
if (cashRaw == null) {
return;
}
- float cash = Bytes.toFloat(cashRaw.array());
+ float cash = Bytes.toFloat(cashRaw.array(), cashRaw.arrayOffset() + cashRaw.position());
if (cash == 0) {
return;
}
Modified: nutch/branches/2.x/src/test/org/apache/nutch/crawl/TestInjector.java
URL: http://svn.apache.org/viewvc/nutch/branches/2.x/src/test/org/apache/nutch/crawl/TestInjector.java?rev=1497447&r1=1497446&r2=1497447&view=diff
==============================================================================
--- nutch/branches/2.x/src/test/org/apache/nutch/crawl/TestInjector.java (original)
+++ nutch/branches/2.x/src/test/org/apache/nutch/crawl/TestInjector.java Thu Jun 27 17:04:42 2013
@@ -25,8 +25,8 @@ import org.apache.avro.util.Utf8;
import org.apache.hadoop.fs.Path;
import org.apache.nutch.storage.WebPage;
import org.apache.nutch.util.AbstractNutchTest;
+import org.apache.nutch.util.Bytes;
import org.apache.nutch.util.CrawlTestUtil;
-import org.apache.gora.util.ByteUtils;
import org.junit.Before;
import org.junit.Ignore;
import org.junit.Test;
@@ -113,7 +113,7 @@ public class TestInjector extends Abstra
representation += "\tnutch.score=" + (int)page.getScore();
ByteBuffer bb = page.getFromMetadata(new Utf8("custom.attribute"));
if (bb != null) {
- representation += "\tcustom.attribute=" + ByteUtils.toString(bb.array());
+ representation += "\tcustom.attribute=" + Bytes.toString(bb);
}
read.add(representation);
}
Modified: nutch/branches/2.x/src/test/org/apache/nutch/fetcher/TestFetcher.java
URL: http://svn.apache.org/viewvc/nutch/branches/2.x/src/test/org/apache/nutch/fetcher/TestFetcher.java?rev=1497447&r1=1497446&r2=1497447&view=diff
==============================================================================
--- nutch/branches/2.x/src/test/org/apache/nutch/fetcher/TestFetcher.java (original)
+++ nutch/branches/2.x/src/test/org/apache/nutch/fetcher/TestFetcher.java Thu Jun 27 17:04:42 2013
@@ -27,6 +27,7 @@ import org.apache.nutch.crawl.InjectorJo
import org.apache.nutch.crawl.URLWebPage;
import org.apache.nutch.storage.Mark;
import org.apache.nutch.util.AbstractNutchTest;
+import org.apache.nutch.util.Bytes;
import org.apache.nutch.util.CrawlTestUtil;
import org.mortbay.jetty.Server;
@@ -113,7 +114,7 @@ public class TestFetcher extends Abstrac
if (bb == null) {
continue;
}
- String content = new String(bb.array());
+ String content = Bytes.toString(bb);
if (content.indexOf("Nutch fetcher test page")!=-1) {
handledurls.add(up.getUrl());
}