[
https://issues.apache.org/jira/browse/NUTCH-1693?page=com.atlassian.jira.plugin.system.issuetabpanels:comment-tabpanel&focusedCommentId=13865306#comment-13865306
]
Markus Jelsma commented on NUTCH-1693:
--------------------------------------
By the way, there are several places in Nutch that still use getBytes(). Would
you suggest we do something about those too?
{code}
markus@midas:~/projects/apache/nutch/trunk$ grep -nr getBytes src/ | grep -v svn
src/test/org/apache/nutch/protocol/TestContent.java:48: Content r = new
Content(url, url, page.getBytes("UTF8"), "text/html",
src/test/org/apache/nutch/protocol/TestContent.java:64:
"".getBytes("UTF8"),
src/test/org/apache/nutch/protocol/TestContent.java:70:
"".getBytes("UTF8"),
src/test/org/apache/nutch/protocol/TestContent.java:76:
"".getBytes("UTF8"),
src/test/org/apache/nutch/protocol/TestContent.java:82:
"<html></html>".getBytes("UTF8"),
src/test/org/apache/nutch/protocol/TestContent.java:88:
"<html></html>".getBytes("UTF8"),
src/test/org/apache/nutch/protocol/TestContent.java:94:
"<html></html>".getBytes("UTF8"),
src/test/org/apache/nutch/protocol/TestContent.java:100:
"".getBytes("UTF8"),
src/test/org/apache/nutch/protocol/TestContent.java:106:
"".getBytes("UTF8"),
src/test/org/apache/nutch/util/TestGZIPUtils.java:121: byte[] testBytes=
SHORT_TEST_STRING.getBytes();
src/test/org/apache/nutch/util/TestGZIPUtils.java:123: testBytes=
LONGER_TEST_STRING.getBytes();
src/test/org/apache/nutch/util/TestGZIPUtils.java:125: testBytes=
WEBPAGE.getBytes();
src/test/org/apache/nutch/util/TestGZIPUtils.java:130: byte[] testBytes=
SHORT_TEST_STRING.getBytes();
src/test/org/apache/nutch/util/TestGZIPUtils.java:132: testBytes=
LONGER_TEST_STRING.getBytes();
src/test/org/apache/nutch/util/TestGZIPUtils.java:134: testBytes=
WEBPAGE.getBytes();
src/test/org/apache/nutch/util/TestGZIPUtils.java:139: byte[] testBytes=
SHORT_TEST_STRING.getBytes();
src/test/org/apache/nutch/util/TestGZIPUtils.java:141: testBytes=
LONGER_TEST_STRING.getBytes();
src/test/org/apache/nutch/util/TestGZIPUtils.java:143: testBytes=
WEBPAGE.getBytes();
src/test/org/apache/nutch/util/TestGZIPUtils.java:148: byte[] testBytes=
SHORT_TEST_STRING.getBytes();
src/test/org/apache/nutch/util/TestGZIPUtils.java:150: testBytes=
LONGER_TEST_STRING.getBytes();
src/test/org/apache/nutch/util/TestGZIPUtils.java:152: testBytes=
WEBPAGE.getBytes();
src/test/org/apache/nutch/util/TestEncodingDetector.java:35:
contentInOctets = "çñôöøДЛжҶ".getBytes("utf-8");
src/test/org/apache/nutch/util/TestNodeWalker.java:65: parser.parse(new
InputSource(new ByteArrayInputStream(WEBPAGE.getBytes())));
src/java/org/apache/nutch/tools/proxy/FakeHandler.java:58: byte[] bytes =
testA.getBytes("UTF-8");
src/java/org/apache/nutch/tools/proxy/FakeHandler.java:62:
os.write(p.getBytes());
src/java/org/apache/nutch/tools/proxy/FakeHandler.java:80:
os.write(link.getBytes());
src/java/org/apache/nutch/tools/proxy/FakeHandler.java:86:
os.write(link.getBytes());
src/java/org/apache/nutch/tools/proxy/FakeHandler.java:92:
os.write(link.getBytes());
src/java/org/apache/nutch/tools/proxy/FakeHandler.java:93:
os.write(testB.getBytes());
src/java/org/apache/nutch/tools/arc/ArcSegmentCreator.java:318: Content
content = new Content(urlStr, urlStr, bytes.getBytes(), contentType,
src/java/org/apache/nutch/tools/Benchmark.java:65:
os.write(url.getBytes());
src/java/org/apache/nutch/crawl/MD5Signature.java:35: if (data == null) data
= content.getUrl().getBytes();
src/java/org/apache/nutch/crawl/Generator.java:375: int hash1 =
hash(url1.getBytes(), 0, url1.getLength());
src/java/org/apache/nutch/crawl/Generator.java:376: int hash2 =
hash(url2.getBytes(), 0, url2.getLength());
src/plugin/protocol-ftp/src/java/org/apache/nutch/protocol/ftp/FtpResponse.java:512:
return new String(x).getBytes();
src/plugin/parse-html/src/test/org/apache/nutch/parse/html/TestRobotsMetaProcessor.java:151:
byte[] bytes= tests[i].getBytes();
src/plugin/parse-html/src/test/org/apache/nutch/parse/html/TestDOMContentUtils.java:242:
new ByteArrayInputStream(testPages[i].getBytes()) ),
src/plugin/protocol-file/src/java/org/apache/nutch/protocol/file/FileResponse.java:308:
return new String(x).getBytes();
src/plugin/protocol-httpclient/src/java/org/apache/nutch/protocol/httpclient/HttpBasicAuthentication.java:92:
byte[] credBytes = (username + ":" + password).getBytes();
src/plugin/lib-http/src/test/org/apache/nutch/protocol/http/api/TestRobotRulesParser.java:84:
rules = parser.parseRules("testRobotsAgent", ROBOTS_STRING.getBytes(),
CONTENT_TYPE, SINGLE_AGENT);
src/plugin/lib-http/src/test/org/apache/nutch/protocol/http/api/TestRobotRulesParser.java:93:
rules = parser.parseRules("testRobotsAgent", ROBOTS_STRING.getBytes(),
CONTENT_TYPE, MULTIPLE_AGENTS);
src/plugin/lib-http/src/test/org/apache/nutch/protocol/http/api/TestRobotRulesParser.java:109:
rules = parser.parseRules("testCrawlDelay", ROBOTS_STRING.getBytes(),
CONTENT_TYPE, SINGLE_AGENT);
src/plugin/lib-http/src/test/org/apache/nutch/protocol/http/api/TestRobotRulesParser.java:113:
rules = parser.parseRules("testCrawlDelay", ROBOTS_STRING.getBytes(),
CONTENT_TYPE, UNKNOWN_AGENT);
src/plugin/parse-tika/src/test/org/apache/nutch/tika/TestRobotsMetaProcessor.java:152:
byte[] bytes= tests[i].getBytes();
src/plugin/parse-tika/src/test/org/apache/nutch/tika/TestDOMContentUtils.java:187:
new InputSource(new ByteArrayInputStream(testPages[i].getBytes())),
src/plugin/language-identifier/src/test/org/apache/nutch/analysis/lang/TestHTMLLanguageParser.java:128:
return new Content(URL, BASE, text.getBytes(), "text/html", meta,
NutchConfiguration.create());
src/plugin/parse-ext/src/test/org/apache/nutch/parse/ext/TestExtParser.java:82:
fos.write(expectedText.getBytes());
src/plugin/feed/src/java/org/apache/nutch/parse/feed/FeedParser.java:267:
new Content(link, link, text.getBytes(), contentType, contentMeta,
src/plugin/protocol-http/src/java/org/apache/nutch/protocol/http/HttpResponse.java:154:
byte[] reqBytes= reqStr.toString().getBytes();
src/plugin/protocol-http/src/java/org/apache/nutch/protocol/http/HttpResponse.java:403:
in.unread(line.substring(pos).getBytes("UTF-8"));
src/plugin/subcollection/src/test/org/apache/nutch/collection/TestSubcollection.java:67:
InputStream is=new ByteArrayInputStream(xml.toString().getBytes());
{code}
> TextMD5Signature computed on textual content
> --------------------------------------------
>
> Key: NUTCH-1693
> URL: https://issues.apache.org/jira/browse/NUTCH-1693
> Project: Nutch
> Issue Type: New Feature
> Reporter: Tien Nguyen Manh
> Assignee: Markus Jelsma
> Priority: Minor
> Fix For: 2.3, 1.8
>
> Attachments: NUTCH-1693-trunk.patch, NUTCH-1693-trunk.patch,
> NUTCH-1693.patch
>
>
> I created a new MD5Signature that is based on textual content. In our case we
> use boilerpipe to extract the main text from the content, so this signature is
> more effective for deduplication.
--
This message was sent by Atlassian JIRA
(v6.1.5#6160)