NUTCH-2264 Forbidden APIs are Checked at Build
Project: http://git-wip-us.apache.org/repos/asf/nutch/repo Commit: http://git-wip-us.apache.org/repos/asf/nutch/commit/a671540a Tree: http://git-wip-us.apache.org/repos/asf/nutch/tree/a671540a Diff: http://git-wip-us.apache.org/repos/asf/nutch/diff/a671540a Branch: refs/heads/2.x Commit: a671540a94d8afafd72a09396c97d9ede43a7ea2 Parents: 9ecdc9b Author: Furkan KAMACI <furkankam...@gmail.com> Authored: Mon Aug 29 18:24:50 2016 +0300 Committer: Furkan KAMACI <furkankam...@gmail.com> Committed: Mon Aug 29 18:44:25 2016 +0300 ---------------------------------------------------------------------- build.xml | 29 ++++++++++++++++++++ ivy/ivy.xml | 2 ++ .../org/apache/nutch/api/impl/JobWorker.java | 9 +++--- .../nutch/api/resources/AdminResource.java | 3 +- .../nutch/api/resources/SeedResource.java | 9 +++--- .../org/apache/nutch/crawl/DbUpdaterJob.java | 3 +- .../org/apache/nutch/crawl/GeneratorJob.java | 3 +- .../org/apache/nutch/crawl/InjectorJob.java | 5 ++-- .../org/apache/nutch/fetcher/FetcherJob.java | 3 +- .../apache/nutch/fetcher/FetcherReducer.java | 8 +++--- .../apache/nutch/host/HostDbUpdateReducer.java | 5 ++-- .../org/apache/nutch/host/HostInjectorJob.java | 3 +- .../org/apache/nutch/net/URLFilterChecker.java | 5 ++-- .../apache/nutch/net/URLNormalizerChecker.java | 5 ++-- src/java/org/apache/nutch/parse/ParseUtil.java | 8 ++++-- src/java/org/apache/nutch/parse/ParserJob.java | 3 +- src/java/org/apache/nutch/protocol/Content.java | 3 +- .../apache/nutch/protocol/RobotRulesParser.java | 6 ++-- src/java/org/apache/nutch/tools/Benchmark.java | 3 +- src/java/org/apache/nutch/tools/DmozParser.java | 5 ++-- .../org/apache/nutch/tools/ResolveUrls.java | 8 +++--- .../apache/nutch/tools/arc/ArcRecordReader.java | 3 +- .../apache/nutch/tools/proxy/FakeHandler.java | 11 ++++---- src/java/org/apache/nutch/util/Bytes.java | 3 +- .../org/apache/nutch/util/EncodingDetector.java | 7 +++-- src/java/org/apache/nutch/util/TimingUtil.java | 3 +- 
src/java/org/apache/nutch/util/URLUtil.java | 5 ++-- .../nutch/util/domain/DomainStatistics.java | 3 +- .../nutch/webui/client/impl/RemoteCommand.java | 6 ++-- .../creativecommons/nutch/CCParseFilter.java | 7 +++-- .../indexer/anchor/AnchorIndexingFilter.java | 3 +- .../nutch/indexer/html/HtmlIndexingFilter.java | 3 +- .../indexer/more/TestMoreIndexingFilter.java | 3 +- .../nutch/analysis/lang/HTMLLanguageParser.java | 9 +++--- .../analysis/lang/TestHTMLLanguageParser.java | 5 ++-- .../nutch/protocol/http/api/HttpBase.java | 3 +- .../protocol/http/api/HttpRobotRulesParser.java | 5 ++-- .../protocol/http/api/TestRobotRulesParser.java | 10 ++++--- .../nutch/urlfilter/api/RegexURLFilterBase.java | 6 ++-- .../urlfilter/api/RegexURLFilterBaseTest.java | 12 +++++--- .../nutch/microformats/reltag/RelTagParser.java | 3 +- .../microformats/reltag/TestRelTagParser.java | 3 +- .../nutch/parse/html/DOMContentUtils.java | 3 +- .../nutch/parse/html/HTMLMetaProcessor.java | 13 +++++---- .../nutch/parse/html/TestDOMContentUtils.java | 3 +- .../parse/html/TestRobotsMetaProcessor.java | 3 +- .../apache/nutch/parse/js/JSParseFilter.java | 5 ++-- .../nutch/parse/metatags/MetaTagsParser.java | 3 +- .../parse/metatags/TestMetaTagsParser.java | 7 +++-- .../nutch/parse/tika/DOMContentUtils.java | 4 ++- .../nutch/parse/tika/HTMLMetaProcessor.java | 13 +++++---- .../nutch/parse/tika/DOMContentUtilsTest.java | 3 +- .../nutch/parse/tika/TestImageMetadata.java | 5 ++-- .../org/apache/nutch/protocol/file/File.java | 3 +- .../nutch/protocol/file/FileResponse.java | 3 +- .../org/apache/nutch/protocol/ftp/Client.java | 3 +- .../java/org/apache/nutch/protocol/ftp/Ftp.java | 3 +- .../apache/nutch/protocol/ftp/FtpResponse.java | 5 ++-- .../nutch/protocol/ftp/FtpRobotRulesParser.java | 5 ++-- .../nutch/protocol/http/HttpResponse.java | 5 ++-- .../httpclient/HttpBasicAuthentication.java | 5 ++-- .../org/apache/nutch/protocol/sftp/Sftp.java | 3 +- .../scoring/opic/TestOPICScoringFilter.java | 8 +++++- 
.../nutch/collection/TestSubcollection.java | 3 +- .../nutch/urlfilter/domain/DomainURLFilter.java | 8 ++++-- .../nutch/urlfilter/prefix/PrefixURLFilter.java | 3 +- .../nutch/urlfilter/suffix/SuffixURLFilter.java | 15 ++++++---- .../urlnormalizer/basic/BasicURLNormalizer.java | 12 +++----- .../urlnormalizer/regex/RegexURLNormalizer.java | 7 +++-- .../regex/TestRegexURLNormalizer.java | 5 ++-- .../apache/nutch/parse/TestSitemapParser.java | 20 ++------------ .../apache/nutch/plugin/TestPluginSystem.java | 12 ++++---- .../apache/nutch/util/TestEncodingDetector.java | 12 ++++---- .../org/apache/nutch/util/TestGZIPUtils.java | 25 +++++++++-------- .../org/apache/nutch/util/TestNodeWalker.java | 3 +- 75 files changed, 286 insertions(+), 185 deletions(-) ---------------------------------------------------------------------- http://git-wip-us.apache.org/repos/asf/nutch/blob/a671540a/build.xml ---------------------------------------------------------------------- diff --git a/build.xml b/build.xml index 061d0b7..f051838 100644 --- a/build.xml +++ b/build.xml @@ -63,6 +63,35 @@ </path> <!-- ====================================================== --> + <!-- Forbidden APIs Targets --> + <!-- ====================================================== --> + + <!-- will be used by forbiddenapis --> + <path id="all-lib-classpath"> + <fileset dir="${build.lib.dir}"> + <include name="**/*.jar" /> + </fileset> + <fileset dir="${runtime.dir}/local/lib"> + <include name="**/*.jar" /> + </fileset> + <fileset dir="${runtime.dir}/local/plugins"> + <include name="**/*.jar" /> + </fileset> + </path> + + <!-- forbiddenapis target --> + <target name="precommit" depends="runtime, test"> + <!-- forbiddenapis task definition --> + <taskdef name="forbiddenapis" classname="de.thetaphi.forbiddenapis.ant.AntTask" classpath="${build.lib.dir}/forbiddenapis-2.2.jar"/> + + <forbiddenapis classpathref="all-lib-classpath" dir="${build.dir}" targetVersion="${javac.version}"> + <bundledsignatures 
name="jdk-unsafe"/> + <bundledsignatures name="jdk-deprecated"/> + <bundledsignatures name="jdk-non-portable"/> + </forbiddenapis> + </target> + + <!-- ====================================================== --> <!-- Stuff needed by all targets --> <!-- ====================================================== --> <target name="init" depends="ivy-init" description="--> stuff required by all targets"> http://git-wip-us.apache.org/repos/asf/nutch/blob/a671540a/ivy/ivy.xml ---------------------------------------------------------------------- diff --git a/ivy/ivy.xml b/ivy/ivy.xml index db42162..e173e71 100644 --- a/ivy/ivy.xml +++ b/ivy/ivy.xml @@ -81,6 +81,8 @@ <dependency org="org.restlet.jse" name="org.restlet.ext.jaxrs" rev="2.2.3" conf="*->default" /> <dependency org="org.restlet.jee" name="org.restlet.ext.crypto" rev="2.2.3" conf="*->default" /> + <dependency org="de.thetaphi" name="forbiddenapis" rev="2.2" conf="*->default"/> + <!--artifacts needed for testing --> <dependency org="junit" name="junit" rev="4.11" conf="*->default" /> http://git-wip-us.apache.org/repos/asf/nutch/blob/a671540a/src/java/org/apache/nutch/api/impl/JobWorker.java ---------------------------------------------------------------------- diff --git a/src/java/org/apache/nutch/api/impl/JobWorker.java b/src/java/org/apache/nutch/api/impl/JobWorker.java index 9c7c5c2..8ac78cc 100644 --- a/src/java/org/apache/nutch/api/impl/JobWorker.java +++ b/src/java/org/apache/nutch/api/impl/JobWorker.java @@ -17,6 +17,7 @@ package org.apache.nutch.api.impl; import java.text.MessageFormat; +import java.util.Locale; import org.apache.hadoop.conf.Configuration; import org.apache.nutch.api.model.request.JobConfig; @@ -49,11 +50,11 @@ public class JobWorker implements Runnable { private String generateId() { if (jobConfig.getCrawlId() == null) { - return MessageFormat.format("{0}-{1}-{2}", jobConfig.getConfId(), - jobConfig.getType(), String.valueOf(hashCode())); + return new MessageFormat("{0}-{1}-{2}", 
Locale.ROOT) + .format(new Object[] {jobConfig.getConfId(), jobConfig.getType(), String.valueOf(hashCode())}); } - return MessageFormat.format("{0}-{1}-{2}-{3}", jobConfig.getCrawlId(), - jobConfig.getConfId(), jobConfig.getType(), String.valueOf(hashCode())); + return new MessageFormat("{0}-{1}-{2}-{3}", Locale.ROOT) + .format(new Object[] {jobConfig.getCrawlId(), jobConfig.getConfId(), jobConfig.getType(), String.valueOf(hashCode())}); } @Override http://git-wip-us.apache.org/repos/asf/nutch/blob/a671540a/src/java/org/apache/nutch/api/resources/AdminResource.java ---------------------------------------------------------------------- diff --git a/src/java/org/apache/nutch/api/resources/AdminResource.java b/src/java/org/apache/nutch/api/resources/AdminResource.java index cfbf8d5..6e93f11 100644 --- a/src/java/org/apache/nutch/api/resources/AdminResource.java +++ b/src/java/org/apache/nutch/api/resources/AdminResource.java @@ -18,6 +18,7 @@ package org.apache.nutch.api.resources; import java.text.MessageFormat; import java.util.Date; +import java.util.Locale; import java.util.concurrent.TimeUnit; import javax.ws.rs.GET; @@ -72,7 +73,7 @@ public class AdminResource extends AbstractResource { } scheduleServerStop(); - return MessageFormat.format("Stopping in {0} seconds.", DELAY_SEC); + return new MessageFormat("Stopping in {0} seconds.", Locale.ROOT).format(DELAY_SEC); } private void scheduleServerStop() { http://git-wip-us.apache.org/repos/asf/nutch/blob/a671540a/src/java/org/apache/nutch/api/resources/SeedResource.java ---------------------------------------------------------------------- diff --git a/src/java/org/apache/nutch/api/resources/SeedResource.java b/src/java/org/apache/nutch/api/resources/SeedResource.java index d7439e0..472c842 100644 --- a/src/java/org/apache/nutch/api/resources/SeedResource.java +++ b/src/java/org/apache/nutch/api/resources/SeedResource.java @@ -20,9 +20,10 @@ import static javax.ws.rs.core.Response.status; import 
java.io.BufferedWriter; import java.io.File; -import java.io.FileNotFoundException; -import java.io.FileWriter; +import java.io.OutputStreamWriter; +import java.io.FileOutputStream; import java.io.IOException; +import java.nio.charset.StandardCharsets; import java.util.Collection; import javax.ws.rs.Consumes; @@ -90,9 +91,7 @@ public class SeedResource extends AbstractResource { private BufferedWriter getWriter(File seedFile) { try { - return new BufferedWriter(new FileWriter(seedFile)); - } catch (FileNotFoundException e) { - throw handleException(e); + return new BufferedWriter(new OutputStreamWriter(new FileOutputStream(seedFile), StandardCharsets.UTF_8)); } catch (IOException e) { throw handleException(e); } http://git-wip-us.apache.org/repos/asf/nutch/blob/a671540a/src/java/org/apache/nutch/crawl/DbUpdaterJob.java ---------------------------------------------------------------------- diff --git a/src/java/org/apache/nutch/crawl/DbUpdaterJob.java b/src/java/org/apache/nutch/crawl/DbUpdaterJob.java index 4b1618c..3885b68 100644 --- a/src/java/org/apache/nutch/crawl/DbUpdaterJob.java +++ b/src/java/org/apache/nutch/crawl/DbUpdaterJob.java @@ -19,6 +19,7 @@ package org.apache.nutch.crawl; import java.text.SimpleDateFormat; import java.util.Collection; import java.util.HashSet; +import java.util.Locale; import java.util.Map; import org.apache.avro.util.Utf8; @@ -129,7 +130,7 @@ public class DbUpdaterJob extends NutchTool implements Tool { private int updateTable(String crawlId, String batchId) throws Exception { - SimpleDateFormat sdf = new SimpleDateFormat("yyyy-MM-dd HH:mm:ss"); + SimpleDateFormat sdf = new SimpleDateFormat("yyyy-MM-dd HH:mm:ss", Locale.ROOT); long start = System.currentTimeMillis(); LOG.info("DbUpdaterJob: starting at " + sdf.format(start)); http://git-wip-us.apache.org/repos/asf/nutch/blob/a671540a/src/java/org/apache/nutch/crawl/GeneratorJob.java ---------------------------------------------------------------------- diff --git 
a/src/java/org/apache/nutch/crawl/GeneratorJob.java b/src/java/org/apache/nutch/crawl/GeneratorJob.java index e06a192..f47637f 100644 --- a/src/java/org/apache/nutch/crawl/GeneratorJob.java +++ b/src/java/org/apache/nutch/crawl/GeneratorJob.java @@ -25,6 +25,7 @@ import java.util.Map; import java.util.Random; import java.util.Set; import java.util.Collection; +import java.util.Locale; import org.apache.hadoop.mapreduce.Job; import org.slf4j.Logger; @@ -255,7 +256,7 @@ public class GeneratorJob extends NutchTool implements Tool { public String generate(long topN, long curTime, boolean filter, boolean norm, boolean sitemap) throws Exception { - SimpleDateFormat sdf = new SimpleDateFormat("yyyy-MM-dd HH:mm:ss"); + SimpleDateFormat sdf = new SimpleDateFormat("yyyy-MM-dd HH:mm:ss", Locale.ROOT); long start = System.currentTimeMillis(); LOG.info("GeneratorJob: starting at {}", sdf.format(start)); LOG.info("GeneratorJob: Selecting best-scoring urls due for fetch."); http://git-wip-us.apache.org/repos/asf/nutch/blob/a671540a/src/java/org/apache/nutch/crawl/InjectorJob.java ---------------------------------------------------------------------- diff --git a/src/java/org/apache/nutch/crawl/InjectorJob.java b/src/java/org/apache/nutch/crawl/InjectorJob.java index 5094b0f..df91a73 100644 --- a/src/java/org/apache/nutch/crawl/InjectorJob.java +++ b/src/java/org/apache/nutch/crawl/InjectorJob.java @@ -44,6 +44,7 @@ import org.slf4j.LoggerFactory; import java.io.IOException; import java.nio.ByteBuffer; +import java.nio.charset.StandardCharsets; import java.text.SimpleDateFormat; import java.util.*; @@ -181,7 +182,7 @@ public class InjectorJob extends NutchTool implements Tool { String keymd = keysIter.next(); String valuemd = metadata.get(keymd); row.getMetadata().put(new Utf8(keymd), - ByteBuffer.wrap(valuemd.getBytes())); + ByteBuffer.wrap(valuemd.getBytes(StandardCharsets.UTF_8))); } if (customScore != -1) @@ -260,7 +261,7 @@ public class InjectorJob extends NutchTool 
implements Tool { } public void inject(Path urlDir) throws Exception { - SimpleDateFormat sdf = new SimpleDateFormat("yyyy-MM-dd HH:mm:ss"); + SimpleDateFormat sdf = new SimpleDateFormat("yyyy-MM-dd HH:mm:ss", Locale.ROOT); long start = System.currentTimeMillis(); LOG.info("InjectorJob: starting at " + sdf.format(start)); LOG.info("InjectorJob: Injecting urlDir: " + urlDir); http://git-wip-us.apache.org/repos/asf/nutch/blob/a671540a/src/java/org/apache/nutch/fetcher/FetcherJob.java ---------------------------------------------------------------------- diff --git a/src/java/org/apache/nutch/fetcher/FetcherJob.java b/src/java/org/apache/nutch/fetcher/FetcherJob.java index 268d9f6..a7f3df8 100644 --- a/src/java/org/apache/nutch/fetcher/FetcherJob.java +++ b/src/java/org/apache/nutch/fetcher/FetcherJob.java @@ -22,6 +22,7 @@ import java.util.Collection; import java.util.HashSet; import java.util.Map; import java.util.Random; +import java.util.Locale; import org.apache.avro.util.Utf8; import org.apache.gora.filter.FilterOp; import org.apache.gora.filter.MapFieldValueFilter; @@ -278,7 +279,7 @@ public class FetcherJob extends NutchTool implements Tool { public int fetch(String batchId, int threads, boolean shouldResume, int numTasks, boolean stmDetect, boolean sitemap) throws Exception { - SimpleDateFormat sdf = new SimpleDateFormat("yyyy-MM-dd HH:mm:ss"); + SimpleDateFormat sdf = new SimpleDateFormat("yyyy-MM-dd HH:mm:ss", Locale.ROOT); long start = System.currentTimeMillis(); LOG.info("FetcherJob: starting at " + sdf.format(start)); http://git-wip-us.apache.org/repos/asf/nutch/blob/a671540a/src/java/org/apache/nutch/fetcher/FetcherReducer.java ---------------------------------------------------------------------- diff --git a/src/java/org/apache/nutch/fetcher/FetcherReducer.java b/src/java/org/apache/nutch/fetcher/FetcherReducer.java index 8ee7477..68b982d 100644 --- a/src/java/org/apache/nutch/fetcher/FetcherReducer.java +++ 
b/src/java/org/apache/nutch/fetcher/FetcherReducer.java @@ -106,7 +106,7 @@ public class FetcherReducer extends LOG.warn("Cannot parse url: " + url, e); return null; } - final String proto = u.getProtocol().toLowerCase(); + final String proto = u.getProtocol().toLowerCase(Locale.ROOT); String host; if (FetchItemQueues.QUEUE_MODE_IP.equalsIgnoreCase(queueMode)) { try { @@ -131,7 +131,7 @@ public class FetcherReducer extends host = u.toExternalForm(); } } - queueID = proto + "://" + host.toLowerCase(); + queueID = proto + "://" + host.toLowerCase(Locale.ROOT); return new FetchItem(url, page, u, queueID); } @@ -639,8 +639,8 @@ public class FetcherReducer extends } if (ignoreExternalLinks) { - String toHost = new URL(newUrl).getHost().toLowerCase(); - String fromHost = new URL(url).getHost().toLowerCase(); + String toHost = new URL(newUrl).getHost().toLowerCase(Locale.ROOT); + String fromHost = new URL(url).getHost().toLowerCase(Locale.ROOT); if (toHost == null || !toHost.equals(fromHost)) { // external links return; http://git-wip-us.apache.org/repos/asf/nutch/blob/a671540a/src/java/org/apache/nutch/host/HostDbUpdateReducer.java ---------------------------------------------------------------------- diff --git a/src/java/org/apache/nutch/host/HostDbUpdateReducer.java b/src/java/org/apache/nutch/host/HostDbUpdateReducer.java index 933f546..3043543 100644 --- a/src/java/org/apache/nutch/host/HostDbUpdateReducer.java +++ b/src/java/org/apache/nutch/host/HostDbUpdateReducer.java @@ -27,6 +27,7 @@ import org.apache.nutch.util.URLUtil; import java.io.IOException; import java.nio.ByteBuffer; +import java.nio.charset.StandardCharsets; import java.util.Set; /** @@ -78,10 +79,10 @@ public class HostDbUpdateReducer extends // output host data Host host = new Host(); host.getMetadata().put(new Utf8("p"), - ByteBuffer.wrap(Integer.toString(numPages).getBytes())); + ByteBuffer.wrap(Integer.toString(numPages).getBytes(StandardCharsets.UTF_8))); if (numFetched > 0) { 
host.getMetadata().put(new Utf8("f"), - ByteBuffer.wrap(Integer.toString(numFetched).getBytes())); + ByteBuffer.wrap(Integer.toString(numFetched).getBytes(StandardCharsets.UTF_8))); } for (String inlink : inlinkCount.getKeys()) { host.getInlinks().put(new Utf8(inlink), http://git-wip-us.apache.org/repos/asf/nutch/blob/a671540a/src/java/org/apache/nutch/host/HostInjectorJob.java ---------------------------------------------------------------------- diff --git a/src/java/org/apache/nutch/host/HostInjectorJob.java b/src/java/org/apache/nutch/host/HostInjectorJob.java index 12cdf28..83f247c 100644 --- a/src/java/org/apache/nutch/host/HostInjectorJob.java +++ b/src/java/org/apache/nutch/host/HostInjectorJob.java @@ -40,6 +40,7 @@ import org.slf4j.LoggerFactory; import java.io.IOException; import java.net.URL; import java.nio.ByteBuffer; +import java.nio.charset.StandardCharsets; import java.util.*; /** @@ -124,7 +125,7 @@ public class HostInjectorJob implements Tool { String keymd = keysIter.next(); String valuemd = metadata.get(keymd); host.getMetadata().put(new Utf8(keymd), - ByteBuffer.wrap(valuemd.getBytes())); + ByteBuffer.wrap(valuemd.getBytes(StandardCharsets.UTF_8))); } String hostname; if (url.indexOf("://") > -1) { http://git-wip-us.apache.org/repos/asf/nutch/blob/a671540a/src/java/org/apache/nutch/net/URLFilterChecker.java ---------------------------------------------------------------------- diff --git a/src/java/org/apache/nutch/net/URLFilterChecker.java b/src/java/org/apache/nutch/net/URLFilterChecker.java index 21ffd03..ee4daf7 100644 --- a/src/java/org/apache/nutch/net/URLFilterChecker.java +++ b/src/java/org/apache/nutch/net/URLFilterChecker.java @@ -27,6 +27,7 @@ import org.apache.nutch.util.NutchConfiguration; import java.io.BufferedReader; import java.io.InputStreamReader; +import java.nio.charset.StandardCharsets; /** * Checks one given filter or all filters. 
@@ -71,7 +72,7 @@ public class URLFilterChecker { System.out.println("Checking URLFilter " + filterName); - BufferedReader in = new BufferedReader(new InputStreamReader(System.in)); + BufferedReader in = new BufferedReader(new InputStreamReader(System.in, StandardCharsets.UTF_8)); String line; while ((line = in.readLine()) != null) { String out = filter.filter(line); @@ -88,7 +89,7 @@ public class URLFilterChecker { private void checkAll() throws Exception { System.out.println("Checking combination of all URLFilters available"); - BufferedReader in = new BufferedReader(new InputStreamReader(System.in)); + BufferedReader in = new BufferedReader(new InputStreamReader(System.in, StandardCharsets.UTF_8)); String line; while ((line = in.readLine()) != null) { URLFilters filters = new URLFilters(this.conf); http://git-wip-us.apache.org/repos/asf/nutch/blob/a671540a/src/java/org/apache/nutch/net/URLNormalizerChecker.java ---------------------------------------------------------------------- diff --git a/src/java/org/apache/nutch/net/URLNormalizerChecker.java b/src/java/org/apache/nutch/net/URLNormalizerChecker.java index d8f1c6e..b1ec60f 100644 --- a/src/java/org/apache/nutch/net/URLNormalizerChecker.java +++ b/src/java/org/apache/nutch/net/URLNormalizerChecker.java @@ -27,6 +27,7 @@ import org.apache.nutch.util.NutchConfiguration; import java.io.BufferedReader; import java.io.InputStreamReader; +import java.nio.charset.StandardCharsets; /** * Checks one given normalizer or all normalizers. 
@@ -66,7 +67,7 @@ public class URLNormalizerChecker { System.out.println("Checking URLNormalizer " + normalizerName); - BufferedReader in = new BufferedReader(new InputStreamReader(System.in)); + BufferedReader in = new BufferedReader(new InputStreamReader(System.in, StandardCharsets.UTF_8)); String line; while ((line = in.readLine()) != null) { String out = normalizer.normalize(line, scope); @@ -77,7 +78,7 @@ public class URLNormalizerChecker { private void checkAll(String scope) throws Exception { System.out.println("Checking combination of all URLNormalizers available"); - BufferedReader in = new BufferedReader(new InputStreamReader(System.in)); + BufferedReader in = new BufferedReader(new InputStreamReader(System.in, StandardCharsets.UTF_8)); String line; URLNormalizers normalizers = new URLNormalizers(conf, scope); while ((line = in.readLine()) != null) { http://git-wip-us.apache.org/repos/asf/nutch/blob/a671540a/src/java/org/apache/nutch/parse/ParseUtil.java ---------------------------------------------------------------------- diff --git a/src/java/org/apache/nutch/parse/ParseUtil.java b/src/java/org/apache/nutch/parse/ParseUtil.java index 8a37063..a38fb0a 100644 --- a/src/java/org/apache/nutch/parse/ParseUtil.java +++ b/src/java/org/apache/nutch/parse/ParseUtil.java @@ -45,6 +45,8 @@ import java.io.IOException; import java.net.MalformedURLException; import java.net.URL; import java.nio.ByteBuffer; +import java.nio.charset.StandardCharsets; +import java.util.Locale; import java.util.Map; import java.util.Set; import java.util.concurrent.ExecutorService; @@ -243,7 +245,7 @@ public class ParseUtil extends Configured { for (Map.Entry<String, String[]> metadata : metaDatas) { System.out.println(); newRow.getMetadata().put(new Utf8(metadata.getKey()), - ByteBuffer.wrap(metadata.getValue()[0].getBytes())); + ByteBuffer.wrap(metadata.getValue()[0].getBytes(StandardCharsets.UTF_8))); } int changeFrequency = calculateFetchInterval( @@ -362,7 +364,7 @@ public class 
ParseUtil extends Configured { String fromHost; if (ignoreExternalLinks) { try { - fromHost = new URL(url).getHost().toLowerCase(); + fromHost = new URL(url).getHost().toLowerCase(Locale.ROOT); } catch (final MalformedURLException e) { fromHost = null; } @@ -382,7 +384,7 @@ public class ParseUtil extends Configured { String toHost; if (ignoreExternalLinks) { try { - toHost = new URL(toUrl).getHost().toLowerCase(); + toHost = new URL(toUrl).getHost().toLowerCase(Locale.ROOT); } catch (final MalformedURLException e) { toHost = null; } http://git-wip-us.apache.org/repos/asf/nutch/blob/a671540a/src/java/org/apache/nutch/parse/ParserJob.java ---------------------------------------------------------------------- diff --git a/src/java/org/apache/nutch/parse/ParserJob.java b/src/java/org/apache/nutch/parse/ParserJob.java index a021879..9762a00 100644 --- a/src/java/org/apache/nutch/parse/ParserJob.java +++ b/src/java/org/apache/nutch/parse/ParserJob.java @@ -21,6 +21,7 @@ import java.nio.ByteBuffer; import java.text.SimpleDateFormat; import java.util.Collection; import java.util.HashSet; +import java.util.Locale; import java.util.Map; import org.apache.avro.util.Utf8; @@ -304,7 +305,7 @@ public class ParserJob extends NutchTool implements Tool { boolean sitemap) throws Exception { - SimpleDateFormat sdf = new SimpleDateFormat("yyyy-MM-dd HH:mm:ss"); + SimpleDateFormat sdf = new SimpleDateFormat("yyyy-MM-dd HH:mm:ss", Locale.ROOT); long start = System.currentTimeMillis(); LOG.info("ParserJob: starting at {}", sdf.format(start)); http://git-wip-us.apache.org/repos/asf/nutch/blob/a671540a/src/java/org/apache/nutch/protocol/Content.java ---------------------------------------------------------------------- diff --git a/src/java/org/apache/nutch/protocol/Content.java b/src/java/org/apache/nutch/protocol/Content.java index f4c4098..77f9b51 100755 --- a/src/java/org/apache/nutch/protocol/Content.java +++ b/src/java/org/apache/nutch/protocol/Content.java @@ -23,6 +23,7 @@ import 
java.io.DataInput; import java.io.DataInputStream; import java.io.DataOutput; import java.io.IOException; +import java.nio.charset.StandardCharsets; import java.util.Arrays; import java.util.zip.InflaterInputStream; @@ -265,7 +266,7 @@ public final class Content implements Writable { buffer.append("contentType: " + contentType + "\n"); buffer.append("metadata: " + metadata + "\n"); buffer.append("Content:\n"); - buffer.append(new String(content)); // try default encoding + buffer.append(new String(content, StandardCharsets.UTF_8)); // try default encoding return buffer.toString(); http://git-wip-us.apache.org/repos/asf/nutch/blob/a671540a/src/java/org/apache/nutch/protocol/RobotRulesParser.java ---------------------------------------------------------------------- diff --git a/src/java/org/apache/nutch/protocol/RobotRulesParser.java b/src/java/org/apache/nutch/protocol/RobotRulesParser.java index 16e380d..867b71b 100644 --- a/src/java/org/apache/nutch/protocol/RobotRulesParser.java +++ b/src/java/org/apache/nutch/protocol/RobotRulesParser.java @@ -19,9 +19,11 @@ package org.apache.nutch.protocol; // JDK imports import java.io.File; -import java.io.FileReader; +import java.io.FileInputStream; +import java.io.InputStreamReader; import java.io.LineNumberReader; import java.net.URL; +import java.nio.charset.StandardCharsets; import java.util.Hashtable; import java.util.StringTokenizer; @@ -172,7 +174,7 @@ public abstract class RobotRulesParser implements Configurable { BaseRobotRules rules = robotParser.parseContent(argv[0], robotsBytes, "text/plain", argv[2]); - LineNumberReader testsIn = new LineNumberReader(new FileReader(argv[1])); + LineNumberReader testsIn = new LineNumberReader(new InputStreamReader(new FileInputStream(argv[1]), StandardCharsets.UTF_8)); String testPath = testsIn.readLine().trim(); while (testPath != null) { System.out.println((rules.isAllowed(testPath) ? 
"allowed" http://git-wip-us.apache.org/repos/asf/nutch/blob/a671540a/src/java/org/apache/nutch/tools/Benchmark.java ---------------------------------------------------------------------- diff --git a/src/java/org/apache/nutch/tools/Benchmark.java b/src/java/org/apache/nutch/tools/Benchmark.java index 6643ba3..68c1755 100644 --- a/src/java/org/apache/nutch/tools/Benchmark.java +++ b/src/java/org/apache/nutch/tools/Benchmark.java @@ -17,6 +17,7 @@ package org.apache.nutch.tools; import java.io.OutputStream; +import java.nio.charset.StandardCharsets; import java.util.ArrayList; import java.util.HashMap; import java.util.List; @@ -55,7 +56,7 @@ public class Benchmark extends Configured implements Tool { OutputStream os = fs.create(new Path(seedsDir, "seeds")); for (int i = 0; i < count; i++) { String url = "http://www.test-" + i + ".com/\r\n"; - os.write(url.getBytes()); + os.write(url.getBytes(StandardCharsets.UTF_8)); } os.flush(); os.close(); http://git-wip-us.apache.org/repos/asf/nutch/blob/a671540a/src/java/org/apache/nutch/tools/DmozParser.java ---------------------------------------------------------------------- diff --git a/src/java/org/apache/nutch/tools/DmozParser.java b/src/java/org/apache/nutch/tools/DmozParser.java index ae63505..03d2662 100644 --- a/src/java/org/apache/nutch/tools/DmozParser.java +++ b/src/java/org/apache/nutch/tools/DmozParser.java @@ -19,6 +19,7 @@ package org.apache.nutch.tools; import java.io.*; import java.nio.ByteBuffer; +import java.nio.charset.StandardCharsets; import java.util.*; import java.util.regex.*; @@ -195,12 +196,12 @@ public class DmozParser { if (row != null) { if (desc.length() > 0) { row.getMetadata().put(new Utf8("_dmoz_desc_"), - ByteBuffer.wrap(desc.toString().getBytes())); + ByteBuffer.wrap(desc.toString().getBytes(StandardCharsets.UTF_8))); desc.delete(0, desc.length()); } if (title.length() > 0) { row.getMetadata().put(new Utf8("_dmoz_title_"), - ByteBuffer.wrap(title.toString().getBytes())); + 
ByteBuffer.wrap(title.toString().getBytes(StandardCharsets.UTF_8))); title.delete(0, title.length()); } store.put(reversedUrl, row); http://git-wip-us.apache.org/repos/asf/nutch/blob/a671540a/src/java/org/apache/nutch/tools/ResolveUrls.java ---------------------------------------------------------------------- diff --git a/src/java/org/apache/nutch/tools/ResolveUrls.java b/src/java/org/apache/nutch/tools/ResolveUrls.java index fe8c24f..8c8bf97 100644 --- a/src/java/org/apache/nutch/tools/ResolveUrls.java +++ b/src/java/org/apache/nutch/tools/ResolveUrls.java @@ -17,9 +17,10 @@ package org.apache.nutch.tools; import java.io.BufferedReader; -import java.io.File; -import java.io.FileReader; +import java.io.FileInputStream; +import java.io.InputStreamReader; import java.net.InetAddress; +import java.nio.charset.StandardCharsets; import java.util.concurrent.ExecutorService; import java.util.concurrent.Executors; import java.util.concurrent.TimeUnit; @@ -102,8 +103,7 @@ public class ResolveUrls { pool = Executors.newFixedThreadPool(numThreads); // read in the urls file and loop through each line, one url per line - BufferedReader buffRead = new BufferedReader(new FileReader(new File( - urlsFile))); + BufferedReader buffRead = new BufferedReader(new InputStreamReader(new FileInputStream(urlsFile), StandardCharsets.UTF_8)); String urlStr = null; while ((urlStr = buffRead.readLine()) != null) { http://git-wip-us.apache.org/repos/asf/nutch/blob/a671540a/src/java/org/apache/nutch/tools/arc/ArcRecordReader.java ---------------------------------------------------------------------- diff --git a/src/java/org/apache/nutch/tools/arc/ArcRecordReader.java b/src/java/org/apache/nutch/tools/arc/ArcRecordReader.java index 2b6a3f9..d3f9799 100644 --- a/src/java/org/apache/nutch/tools/arc/ArcRecordReader.java +++ b/src/java/org/apache/nutch/tools/arc/ArcRecordReader.java @@ -18,6 +18,7 @@ package org.apache.nutch.tools.arc; import java.io.ByteArrayOutputStream; import 
java.io.IOException; +import java.nio.charset.StandardCharsets; import java.util.zip.GZIPInputStream; import org.slf4j.Logger; @@ -269,7 +270,7 @@ public class ArcRecordReader implements RecordReader<Text, BytesWritable> { } // create the header and the raw content minus the header - String header = new String(content, 0, eol).trim(); + String header = new String(content, 0, eol, StandardCharsets.UTF_8).trim(); byte[] raw = new byte[(content.length - eol) - 1]; System.arraycopy(content, eol + 1, raw, 0, raw.length); http://git-wip-us.apache.org/repos/asf/nutch/blob/a671540a/src/java/org/apache/nutch/tools/proxy/FakeHandler.java ---------------------------------------------------------------------- diff --git a/src/java/org/apache/nutch/tools/proxy/FakeHandler.java b/src/java/org/apache/nutch/tools/proxy/FakeHandler.java index fce2d3b..699cfa3 100644 --- a/src/java/org/apache/nutch/tools/proxy/FakeHandler.java +++ b/src/java/org/apache/nutch/tools/proxy/FakeHandler.java @@ -35,6 +35,7 @@ package org.apache.nutch.tools.proxy; import java.io.IOException; import java.io.OutputStream; +import java.nio.charset.StandardCharsets; import java.util.Random; import java.util.concurrent.atomic.AtomicLong; @@ -118,7 +119,7 @@ public class FakeHandler extends AbstractTestbedHandler { os.write(bytes); // record URI String p = "<p>URI: " + uri + "</p>\r\n"; - os.write(p.getBytes()); + os.write(p.getBytes(StandardCharsets.UTF_8)); // fake some links String basePath; String baseDomain; @@ -142,7 +143,7 @@ public class FakeHandler extends AbstractTestbedHandler { link += pageSeq.getAndIncrement() + ".html'>"; } link += "outlink " + i + "</a></p>\r\n"; - os.write(link.getBytes()); + os.write(link.getBytes(StandardCharsets.UTF_8)); } baseDomain = u.getHost(); // chop off the TLD @@ -160,15 +161,15 @@ public class FakeHandler extends AbstractTestbedHandler { link = "http://" + host + "/"; } link = "<p><a href='" + link + "'>fake host " + host + "</a></p>\r\n"; - 
os.write(link.getBytes()); + os.write(link.getBytes(StandardCharsets.UTF_8)); } // fake a link to the root URL link = "<p><a href='" + u.getScheme() + "://" + u.getHost(); if (u.getPort() != 80 && u.getPort() != -1) link += ":" + u.getPort(); link += "/'>site " + u.getHost() + "</a></p>\r\n"; - os.write(link.getBytes()); - os.write(testB.getBytes()); + os.write(link.getBytes(StandardCharsets.UTF_8)); + os.write(testB.getBytes(StandardCharsets.UTF_8)); res.flushBuffer(); } catch (IOException ioe) { } http://git-wip-us.apache.org/repos/asf/nutch/blob/a671540a/src/java/org/apache/nutch/util/Bytes.java ---------------------------------------------------------------------- diff --git a/src/java/org/apache/nutch/util/Bytes.java b/src/java/org/apache/nutch/util/Bytes.java index 87323a6..db9f468 100644 --- a/src/java/org/apache/nutch/util/Bytes.java +++ b/src/java/org/apache/nutch/util/Bytes.java @@ -28,6 +28,7 @@ import java.math.BigInteger; import java.nio.ByteBuffer; import java.util.Comparator; import java.util.Iterator; +import java.util.Locale; import org.slf4j.Logger; import org.slf4j.LoggerFactory; @@ -396,7 +397,7 @@ public class Bytes { || " `~!@#$%^&*()-_=+[]{}\\|;:'\",.<>/?".indexOf(ch) >= 0) { result.append(first.charAt(i)); } else { - result.append(String.format("\\x%02X", ch)); + result.append(String.format(Locale.ROOT, "\\x%02X", ch)); } } } catch (UnsupportedEncodingException e) { http://git-wip-us.apache.org/repos/asf/nutch/blob/a671540a/src/java/org/apache/nutch/util/EncodingDetector.java ---------------------------------------------------------------------- diff --git a/src/java/org/apache/nutch/util/EncodingDetector.java b/src/java/org/apache/nutch/util/EncodingDetector.java index ff6cf00..5b40e29 100644 --- a/src/java/org/apache/nutch/util/EncodingDetector.java +++ b/src/java/org/apache/nutch/util/EncodingDetector.java @@ -32,6 +32,7 @@ import java.util.ArrayList; import java.util.HashMap; import java.util.HashSet; import java.util.List; +import 
java.util.Locale; /** * A simple class for detecting character encodings. @@ -72,7 +73,7 @@ public class EncodingDetector { } public EncodingClue(String value, String source, int confidence) { - this.value = value.toLowerCase(); + this.value = value.toLowerCase(Locale.ROOT); this.source = source; this.confidence = confidence; } @@ -269,7 +270,7 @@ public class EncodingDetector { LOG.trace(baseUrl + ": Choosing encoding: " + charset + " with confidence " + clue.confidence); } - return resolveEncodingAlias(charset).toLowerCase(); + return resolveEncodingAlias(charset).toLowerCase(Locale.ROOT); } else if (clue.confidence == NO_THRESHOLD && bestClue == defaultClue) { bestClue = clue; } @@ -278,7 +279,7 @@ public class EncodingDetector { if (LOG.isTraceEnabled()) { LOG.trace(baseUrl + ": Choosing encoding: " + bestClue); } - return bestClue.value.toLowerCase(); + return bestClue.value.toLowerCase(Locale.ROOT); } /** Clears all clues. */ http://git-wip-us.apache.org/repos/asf/nutch/blob/a671540a/src/java/org/apache/nutch/util/TimingUtil.java ---------------------------------------------------------------------- diff --git a/src/java/org/apache/nutch/util/TimingUtil.java b/src/java/org/apache/nutch/util/TimingUtil.java index 8f77969..524bee6 100644 --- a/src/java/org/apache/nutch/util/TimingUtil.java +++ b/src/java/org/apache/nutch/util/TimingUtil.java @@ -18,6 +18,7 @@ package org.apache.nutch.util; import java.text.NumberFormat; +import java.util.Locale; public class TimingUtil { @@ -45,7 +46,7 @@ public class TimingUtil { start += TIME_FACTOR[i] * elapsedTime[i]; } - NumberFormat nf = NumberFormat.getInstance(); + NumberFormat nf = NumberFormat.getInstance(Locale.ROOT); nf.setMinimumIntegerDigits(2); StringBuffer buf = new StringBuffer(); for (int i = 0; i < elapsedTime.length; i++) { http://git-wip-us.apache.org/repos/asf/nutch/blob/a671540a/src/java/org/apache/nutch/util/URLUtil.java ---------------------------------------------------------------------- diff --git 
a/src/java/org/apache/nutch/util/URLUtil.java b/src/java/org/apache/nutch/util/URLUtil.java index df16423..5183ba1 100644 --- a/src/java/org/apache/nutch/util/URLUtil.java +++ b/src/java/org/apache/nutch/util/URLUtil.java @@ -19,6 +19,7 @@ package org.apache.nutch.util; import java.net.MalformedURLException; import java.net.*; +import java.util.Locale; import java.util.regex.Pattern; import org.apache.nutch.util.domain.DomainSuffix; @@ -386,7 +387,7 @@ public class URLUtil { */ public static String getHost(String url) { try { - return new URL(url).getHost().toLowerCase(); + return new URL(url).getHost().toLowerCase(Locale.ROOT); } catch (MalformedURLException e) { return null; } @@ -404,7 +405,7 @@ public class URLUtil { public static String getPage(String url) { try { // get the full url, and replace the query string with an empty string - url = url.toLowerCase(); + url = url.toLowerCase(Locale.ROOT); String queryStr = new URL(url).getQuery(); return (queryStr != null) ? url.replace("?" 
+ queryStr, "") : url; } catch (MalformedURLException e) { http://git-wip-us.apache.org/repos/asf/nutch/blob/a671540a/src/java/org/apache/nutch/util/domain/DomainStatistics.java ---------------------------------------------------------------------- diff --git a/src/java/org/apache/nutch/util/domain/DomainStatistics.java b/src/java/org/apache/nutch/util/domain/DomainStatistics.java index 57eb81e..7313a03 100644 --- a/src/java/org/apache/nutch/util/domain/DomainStatistics.java +++ b/src/java/org/apache/nutch/util/domain/DomainStatistics.java @@ -20,6 +20,7 @@ package org.apache.nutch.util.domain; import java.io.IOException; import java.net.URL; import java.text.SimpleDateFormat; +import java.util.Locale; import org.apache.gora.mapreduce.GoraMapper; import org.apache.gora.query.Query; @@ -97,7 +98,7 @@ public class DomainStatistics extends Configured implements Tool { } } - SimpleDateFormat sdf = new SimpleDateFormat("yyyy-MM-dd HH:mm:ss"); + SimpleDateFormat sdf = new SimpleDateFormat("yyyy-MM-dd HH:mm:ss", Locale.ROOT); long start = System.currentTimeMillis(); LOG.info("DomainStatistics: starting at " + sdf.format(start)); http://git-wip-us.apache.org/repos/asf/nutch/blob/a671540a/src/java/org/apache/nutch/webui/client/impl/RemoteCommand.java ---------------------------------------------------------------------- diff --git a/src/java/org/apache/nutch/webui/client/impl/RemoteCommand.java b/src/java/org/apache/nutch/webui/client/impl/RemoteCommand.java index ea19a8a..107771a 100644 --- a/src/java/org/apache/nutch/webui/client/impl/RemoteCommand.java +++ b/src/java/org/apache/nutch/webui/client/impl/RemoteCommand.java @@ -18,6 +18,7 @@ package org.apache.nutch.webui.client.impl; import java.io.Serializable; import java.text.MessageFormat; +import java.util.Locale; import org.apache.commons.lang3.StringUtils; import org.apache.nutch.webui.client.model.JobConfig; @@ -68,9 +69,8 @@ public class RemoteCommand implements Serializable { public String toString() { String 
statusInfo = StringUtils.EMPTY; if (jobInfo != null) { - statusInfo = MessageFormat.format("{0}", jobInfo.getState()); + statusInfo = new MessageFormat("{0}", Locale.ROOT).format(new Object[] {jobInfo.getState()}); } - return MessageFormat.format("{0} status: {1}", jobConfig.getType(), - statusInfo); + return new MessageFormat("{0} status: {1}", Locale.ROOT).format(new Object[] {jobConfig.getType(), statusInfo}); } } http://git-wip-us.apache.org/repos/asf/nutch/blob/a671540a/src/plugin/creativecommons/src/java/org/creativecommons/nutch/CCParseFilter.java ---------------------------------------------------------------------- diff --git a/src/plugin/creativecommons/src/java/org/creativecommons/nutch/CCParseFilter.java b/src/plugin/creativecommons/src/java/org/creativecommons/nutch/CCParseFilter.java index f8db384..be427ef 100644 --- a/src/plugin/creativecommons/src/java/org/creativecommons/nutch/CCParseFilter.java +++ b/src/plugin/creativecommons/src/java/org/creativecommons/nutch/CCParseFilter.java @@ -37,6 +37,7 @@ import java.nio.ByteBuffer; import java.util.Collection; import java.util.HashMap; import java.util.HashSet; +import java.nio.charset.StandardCharsets; /** Adds metadata identifying the Creative Commons license used, if any. 
*/ public class CCParseFilter implements ParseFilter { @@ -87,9 +88,9 @@ public class CCParseFilter implements ParseFilter { + " of " + base); } page.getMetadata().put(new Utf8(CreativeCommons.LICENSE_URL), - ByteBuffer.wrap(licenseUrl.getBytes())); + ByteBuffer.wrap(licenseUrl.getBytes(StandardCharsets.UTF_8))); page.getMetadata().put(new Utf8(CreativeCommons.LICENSE_LOCATION), - ByteBuffer.wrap(licenseLocation.getBytes())); + ByteBuffer.wrap(licenseLocation.getBytes(StandardCharsets.UTF_8))); } if (walker.workType != null) { @@ -97,7 +98,7 @@ public class CCParseFilter implements ParseFilter { LOG.debug("CC: found " + walker.workType + " in " + base); } page.getMetadata().put(new Utf8(CreativeCommons.WORK_TYPE), - ByteBuffer.wrap(walker.workType.getBytes())); + ByteBuffer.wrap(walker.workType.getBytes(StandardCharsets.UTF_8))); } } http://git-wip-us.apache.org/repos/asf/nutch/blob/a671540a/src/plugin/index-anchor/src/java/org/apache/nutch/indexer/anchor/AnchorIndexingFilter.java ---------------------------------------------------------------------- diff --git a/src/plugin/index-anchor/src/java/org/apache/nutch/indexer/anchor/AnchorIndexingFilter.java b/src/plugin/index-anchor/src/java/org/apache/nutch/indexer/anchor/AnchorIndexingFilter.java index 25149be..9e2e75b 100644 --- a/src/plugin/index-anchor/src/java/org/apache/nutch/indexer/anchor/AnchorIndexingFilter.java +++ b/src/plugin/index-anchor/src/java/org/apache/nutch/indexer/anchor/AnchorIndexingFilter.java @@ -30,6 +30,7 @@ import java.lang.CharSequence; import java.util.Collection; import java.util.HashSet; import java.util.Map.Entry; +import java.util.Locale; /** * Indexing filter that offers an option to either index all inbound anchor text @@ -97,7 +98,7 @@ public class AnchorIndexingFilter implements IndexingFilter { if (deduplicate) { if (set == null) set = new HashSet<String>(); - String lcAnchor = anchor.toLowerCase(); + String lcAnchor = anchor.toLowerCase(Locale.ROOT); // Check if already processed 
the current anchor if (!set.contains(lcAnchor)) { http://git-wip-us.apache.org/repos/asf/nutch/blob/a671540a/src/plugin/index-html/src/java/org/apache/nutch/indexer/html/HtmlIndexingFilter.java ---------------------------------------------------------------------- diff --git a/src/plugin/index-html/src/java/org/apache/nutch/indexer/html/HtmlIndexingFilter.java b/src/plugin/index-html/src/java/org/apache/nutch/indexer/html/HtmlIndexingFilter.java index 6db3bea..eb1454b 100644 --- a/src/plugin/index-html/src/java/org/apache/nutch/indexer/html/HtmlIndexingFilter.java +++ b/src/plugin/index-html/src/java/org/apache/nutch/indexer/html/HtmlIndexingFilter.java @@ -18,6 +18,7 @@ package org.apache.nutch.indexer.html; import java.io.ByteArrayInputStream; import java.nio.ByteBuffer; +import java.nio.charset.StandardCharsets; import java.util.Collection; import java.util.HashSet; import java.util.Scanner; @@ -67,7 +68,7 @@ public class HtmlIndexingFilter implements IndexingFilter { LOG.info("Html indexing for: " + url.toString()); } ByteArrayInputStream arrayInputStream = new ByteArrayInputStream(raw.array(), raw.arrayOffset() + raw.position(), raw.remaining()); - Scanner scanner = new Scanner(arrayInputStream); + Scanner scanner = new Scanner(arrayInputStream, StandardCharsets.UTF_8.name()); scanner.useDelimiter("\\Z");//To read all scanner content in one String String data = ""; if (scanner.hasNext()) { http://git-wip-us.apache.org/repos/asf/nutch/blob/a671540a/src/plugin/index-more/src/test/org/apache/nutch/indexer/more/TestMoreIndexingFilter.java ---------------------------------------------------------------------- diff --git a/src/plugin/index-more/src/test/org/apache/nutch/indexer/more/TestMoreIndexingFilter.java b/src/plugin/index-more/src/test/org/apache/nutch/indexer/more/TestMoreIndexingFilter.java index 2e9da51..206831d 100644 --- a/src/plugin/index-more/src/test/org/apache/nutch/indexer/more/TestMoreIndexingFilter.java +++ 
b/src/plugin/index-more/src/test/org/apache/nutch/indexer/more/TestMoreIndexingFilter.java @@ -26,6 +26,7 @@ import org.apache.nutch.util.NutchConfiguration; import org.junit.Test; import java.nio.ByteBuffer; +import java.nio.charset.StandardCharsets; import static org.junit.Assert.*; @@ -81,7 +82,7 @@ public class TestMoreIndexingFilter { filter.setConf(conf); WebPage page = WebPage.newBuilder().build(); String url = "http://www.example.com/"; - page.setContent(ByteBuffer.wrap("text".getBytes())); + page.setContent(ByteBuffer.wrap("text".getBytes(StandardCharsets.UTF_8))); page.setTitle(new Utf8("title")); page.getHeaders().put(EncodingDetector.CONTENT_TYPE_UTF8, new Utf8(source)); NutchDocument doc = filter.filter(new NutchDocument(), url, page); http://git-wip-us.apache.org/repos/asf/nutch/blob/a671540a/src/plugin/language-identifier/src/java/org/apache/nutch/analysis/lang/HTMLLanguageParser.java ---------------------------------------------------------------------- diff --git a/src/plugin/language-identifier/src/java/org/apache/nutch/analysis/lang/HTMLLanguageParser.java b/src/plugin/language-identifier/src/java/org/apache/nutch/analysis/lang/HTMLLanguageParser.java index ee0560b..f3af6a9 100644 --- a/src/plugin/language-identifier/src/java/org/apache/nutch/analysis/lang/HTMLLanguageParser.java +++ b/src/plugin/language-identifier/src/java/org/apache/nutch/analysis/lang/HTMLLanguageParser.java @@ -39,6 +39,7 @@ import org.w3c.dom.Node; import java.lang.CharSequence; import java.nio.ByteBuffer; +import java.nio.charset.StandardCharsets; import java.util.*; /** @@ -67,7 +68,7 @@ public class HTMLLanguageParser implements ParseFilter { String[] values = p.getProperty(key).split(",", -1); LANGUAGES_MAP.put(key, key); for (int i = 0; i < values.length; i++) { - LANGUAGES_MAP.put(values[i].trim().toLowerCase(), key); + LANGUAGES_MAP.put(values[i].trim().toLowerCase(Locale.ROOT), key); } } } catch (Exception e) { @@ -115,7 +116,7 @@ public class HTMLLanguageParser 
implements ParseFilter { if (lang != null) { page.getMetadata().put(new Utf8(Metadata.LANGUAGE), - ByteBuffer.wrap(lang.getBytes())); + ByteBuffer.wrap(lang.getBytes(StandardCharsets.UTF_8))); return parse; } @@ -255,7 +256,7 @@ public class HTMLLanguageParser implements ParseFilter { Node attrnode = attrs.item(i); if ("http-equiv".equalsIgnoreCase(attrnode.getNodeName())) { if ("content-language".equals(attrnode.getNodeValue() - .toLowerCase())) { + .toLowerCase(Locale.ROOT))) { Node valueattr = attrs.getNamedItem("content"); if (valueattr != null) { httpEquiv = parseLanguage(valueattr.getNodeValue()); @@ -296,7 +297,7 @@ public class HTMLLanguageParser implements ParseFilter { code = langs[i].split("-")[0]; code = code.split("_")[0]; // Find the ISO 639 code - language = (String) LANGUAGES_MAP.get(code.toLowerCase()); + language = (String) LANGUAGES_MAP.get(code.toLowerCase(Locale.ROOT)); i++; } http://git-wip-us.apache.org/repos/asf/nutch/blob/a671540a/src/plugin/language-identifier/src/test/org/apache/nutch/analysis/lang/TestHTMLLanguageParser.java ---------------------------------------------------------------------- diff --git a/src/plugin/language-identifier/src/test/org/apache/nutch/analysis/lang/TestHTMLLanguageParser.java b/src/plugin/language-identifier/src/test/org/apache/nutch/analysis/lang/TestHTMLLanguageParser.java index c98c00f..1432999 100644 --- a/src/plugin/language-identifier/src/test/org/apache/nutch/analysis/lang/TestHTMLLanguageParser.java +++ b/src/plugin/language-identifier/src/test/org/apache/nutch/analysis/lang/TestHTMLLanguageParser.java @@ -31,6 +31,7 @@ import org.junit.Test; import java.io.BufferedReader; import java.io.InputStreamReader; import java.nio.ByteBuffer; +import java.nio.charset.StandardCharsets; import static org.junit.Assert.assertEquals; import static org.junit.Assert.fail; @@ -107,7 +108,7 @@ public class TestHTMLLanguageParser { long total = 0; LanguageIdentifier identifier; BufferedReader in = new BufferedReader(new 
InputStreamReader(this - .getClass().getResourceAsStream("test-referencial.txt"))); + .getClass().getResourceAsStream("test-referencial.txt"), StandardCharsets.UTF_8)); String line = null; while ((line = in.readLine()) != null) { String[] tokens = line.split(";"); @@ -149,7 +150,7 @@ public class TestHTMLLanguageParser { private WebPage getPage(String text) { WebPage page = WebPage.newBuilder().build(); page.setBaseUrl(BASE); - page.setContent(ByteBuffer.wrap(text.getBytes())); + page.setContent(ByteBuffer.wrap(text.getBytes(StandardCharsets.UTF_8))); page.setContentType(new Utf8("text/html")); page.getHeaders().put(EncodingDetector.CONTENT_TYPE_UTF8, new Utf8("text/html")); http://git-wip-us.apache.org/repos/asf/nutch/blob/a671540a/src/plugin/lib-http/src/java/org/apache/nutch/protocol/http/api/HttpBase.java ---------------------------------------------------------------------- diff --git a/src/plugin/lib-http/src/java/org/apache/nutch/protocol/http/api/HttpBase.java b/src/plugin/lib-http/src/java/org/apache/nutch/protocol/http/api/HttpBase.java index d0a4726..0a6121b 100644 --- a/src/plugin/lib-http/src/java/org/apache/nutch/protocol/http/api/HttpBase.java +++ b/src/plugin/lib-http/src/java/org/apache/nutch/protocol/http/api/HttpBase.java @@ -22,6 +22,7 @@ import java.io.IOException; import java.io.Reader; import java.net.URL; import java.nio.ByteBuffer; +import java.nio.charset.StandardCharsets; import java.util.ArrayList; import java.util.Arrays; import java.util.HashSet; @@ -516,7 +517,7 @@ public abstract class HttpBase implements Protocol { System.out.println("Content Length: " + content.getMetadata().get(Response.CONTENT_LENGTH)); System.out.println("Content:"); - String text = new String(content.getContent()); + String text = new String(content.getContent(), StandardCharsets.UTF_8); System.out.println(text); } } 
http://git-wip-us.apache.org/repos/asf/nutch/blob/a671540a/src/plugin/lib-http/src/java/org/apache/nutch/protocol/http/api/HttpRobotRulesParser.java ---------------------------------------------------------------------- diff --git a/src/plugin/lib-http/src/java/org/apache/nutch/protocol/http/api/HttpRobotRulesParser.java b/src/plugin/lib-http/src/java/org/apache/nutch/protocol/http/api/HttpRobotRulesParser.java index 1d6ea55..bd64d76 100644 --- a/src/plugin/lib-http/src/java/org/apache/nutch/protocol/http/api/HttpRobotRulesParser.java +++ b/src/plugin/lib-http/src/java/org/apache/nutch/protocol/http/api/HttpRobotRulesParser.java @@ -28,6 +28,7 @@ import org.slf4j.Logger; import org.slf4j.LoggerFactory; import java.net.URL; +import java.util.Locale; /** * This class is used for parsing robots for urls belonging to HTTP protocol. It @@ -50,9 +51,9 @@ public class HttpRobotRulesParser extends RobotRulesParser { /** Compose unique key to store and access robot rules in cache for given URL */ protected static String getCacheKey(URL url) { - String protocol = url.getProtocol().toLowerCase(); // normalize to lower + String protocol = url.getProtocol().toLowerCase(Locale.ROOT); // normalize to lower // case - String host = url.getHost().toLowerCase(); // normalize to lower case + String host = url.getHost().toLowerCase(Locale.ROOT); // normalize to lower case int port = url.getPort(); if (port == -1) { port = url.getDefaultPort(); http://git-wip-us.apache.org/repos/asf/nutch/blob/a671540a/src/plugin/lib-http/src/test/org/apache/nutch/protocol/http/api/TestRobotRulesParser.java ---------------------------------------------------------------------- diff --git a/src/plugin/lib-http/src/test/org/apache/nutch/protocol/http/api/TestRobotRulesParser.java b/src/plugin/lib-http/src/test/org/apache/nutch/protocol/http/api/TestRobotRulesParser.java index 47b41a3..8d033e9 100644 --- a/src/plugin/lib-http/src/test/org/apache/nutch/protocol/http/api/TestRobotRulesParser.java +++ 
b/src/plugin/lib-http/src/test/org/apache/nutch/protocol/http/api/TestRobotRulesParser.java @@ -17,6 +17,8 @@ package org.apache.nutch.protocol.http.api; +import java.nio.charset.StandardCharsets; + import org.junit.Before; import org.junit.Test; @@ -80,7 +82,7 @@ public class TestRobotRulesParser { */ @Test public void testRobotsAgent() { - rules = parser.parseRules("testRobotsAgent", ROBOTS_STRING.getBytes(), + rules = parser.parseRules("testRobotsAgent", ROBOTS_STRING.getBytes(StandardCharsets.UTF_8), CONTENT_TYPE, SINGLE_AGENT); for (int counter = 0; counter < TEST_PATHS.length; counter++) { @@ -91,7 +93,7 @@ public class TestRobotRulesParser { rules.isAllowed(TEST_PATHS[counter]) == RESULTS[counter]); } - rules = parser.parseRules("testRobotsAgent", ROBOTS_STRING.getBytes(), + rules = parser.parseRules("testRobotsAgent", ROBOTS_STRING.getBytes(StandardCharsets.UTF_8), CONTENT_TYPE, MULTIPLE_AGENTS); for (int counter = 0; counter < TEST_PATHS.length; counter++) { @@ -112,13 +114,13 @@ public class TestRobotRulesParser { public void testCrawlDelay() { // for SINGLE_AGENT, the crawl delay of 10 sec ie. 10000 msec must be // returned by the parser - rules = parser.parseRules("testCrawlDelay", ROBOTS_STRING.getBytes(), + rules = parser.parseRules("testCrawlDelay", ROBOTS_STRING.getBytes(StandardCharsets.UTF_8), CONTENT_TYPE, SINGLE_AGENT); assertTrue("testing crawl delay for agent " + SINGLE_AGENT + " : ", (rules.getCrawlDelay() == 10000)); // for UNKNOWN_AGENT, the default crawl delay must be returned. 
- rules = parser.parseRules("testCrawlDelay", ROBOTS_STRING.getBytes(), + rules = parser.parseRules("testCrawlDelay", ROBOTS_STRING.getBytes(StandardCharsets.UTF_8), CONTENT_TYPE, UNKNOWN_AGENT); assertTrue("testing crawl delay for agent " + UNKNOWN_AGENT + " : ", (rules.getCrawlDelay() == Long.MIN_VALUE)); http://git-wip-us.apache.org/repos/asf/nutch/blob/a671540a/src/plugin/lib-regex-filter/src/java/org/apache/nutch/urlfilter/api/RegexURLFilterBase.java ---------------------------------------------------------------------- diff --git a/src/plugin/lib-regex-filter/src/java/org/apache/nutch/urlfilter/api/RegexURLFilterBase.java b/src/plugin/lib-regex-filter/src/java/org/apache/nutch/urlfilter/api/RegexURLFilterBase.java index 40ba266..d374e95 100644 --- a/src/plugin/lib-regex-filter/src/java/org/apache/nutch/urlfilter/api/RegexURLFilterBase.java +++ b/src/plugin/lib-regex-filter/src/java/org/apache/nutch/urlfilter/api/RegexURLFilterBase.java @@ -22,8 +22,10 @@ import java.io.Reader; import java.io.FileReader; import java.io.BufferedReader; import java.io.InputStreamReader; +import java.io.FileInputStream; import java.io.IOException; import java.io.StringReader; +import java.nio.charset.StandardCharsets; import java.util.List; import java.util.ArrayList; @@ -82,7 +84,7 @@ public abstract class RegexURLFilterBase implements URLFilter { */ public RegexURLFilterBase(File filename) throws IOException, IllegalArgumentException { - this(new FileReader(filename)); + this(new InputStreamReader(new FileInputStream(filename), StandardCharsets.UTF_8)); } /** @@ -245,7 +247,7 @@ public abstract class RegexURLFilterBase implements URLFilter { public static void main(RegexURLFilterBase filter, String args[]) throws IOException, IllegalArgumentException { - BufferedReader in = new BufferedReader(new InputStreamReader(System.in)); + BufferedReader in = new BufferedReader(new InputStreamReader(System.in, StandardCharsets.UTF_8)); String line; while ((line = in.readLine()) != null) { 
String out = filter.filter(line); http://git-wip-us.apache.org/repos/asf/nutch/blob/a671540a/src/plugin/lib-regex-filter/src/test/org/apache/nutch/urlfilter/api/RegexURLFilterBaseTest.java ---------------------------------------------------------------------- diff --git a/src/plugin/lib-regex-filter/src/test/org/apache/nutch/urlfilter/api/RegexURLFilterBaseTest.java b/src/plugin/lib-regex-filter/src/test/org/apache/nutch/urlfilter/api/RegexURLFilterBaseTest.java index 2b40b48..ae4660f 100644 --- a/src/plugin/lib-regex-filter/src/test/org/apache/nutch/urlfilter/api/RegexURLFilterBaseTest.java +++ b/src/plugin/lib-regex-filter/src/test/org/apache/nutch/urlfilter/api/RegexURLFilterBaseTest.java @@ -21,6 +21,10 @@ import java.io.BufferedReader; import java.io.FileReader; import java.io.IOException; import java.io.Reader; +import java.io.InputStream; +import java.io.InputStreamReader; +import java.io.FileInputStream; +import java.nio.charset.StandardCharsets; import java.util.ArrayList; import java.util.List; @@ -57,8 +61,8 @@ public abstract class RegexURLFilterBaseTest { protected void bench(int loops, String file) { try { - bench(loops, new FileReader(SAMPLES + SEPARATOR + file + ".rules"), - new FileReader(SAMPLES + SEPARATOR + file + ".urls")); + bench(loops, new InputStreamReader(new FileInputStream(SAMPLES + SEPARATOR + file + ".rules"), StandardCharsets.UTF_8), + new InputStreamReader(new FileInputStream(SAMPLES + SEPARATOR + file + ".urls"), StandardCharsets.UTF_8)); } catch (Exception e) { fail(e.toString()); } @@ -81,8 +85,8 @@ public abstract class RegexURLFilterBaseTest { protected void test(String file) { try { - test(new FileReader(SAMPLES + SEPARATOR + file + ".rules"), - new FileReader(SAMPLES + SEPARATOR + file + ".urls")); + test(new InputStreamReader(new FileInputStream(SAMPLES + SEPARATOR + file + ".rules"), StandardCharsets.UTF_8), + new InputStreamReader(new FileInputStream(SAMPLES + SEPARATOR + file + ".urls"), StandardCharsets.UTF_8)); } catch 
(Exception e) { fail(e.toString()); } http://git-wip-us.apache.org/repos/asf/nutch/blob/a671540a/src/plugin/microformats-reltag/src/java/org/apache/nutch/microformats/reltag/RelTagParser.java ---------------------------------------------------------------------- diff --git a/src/plugin/microformats-reltag/src/java/org/apache/nutch/microformats/reltag/RelTagParser.java b/src/plugin/microformats-reltag/src/java/org/apache/nutch/microformats/reltag/RelTagParser.java index f71c5ab..00fa30d 100644 --- a/src/plugin/microformats-reltag/src/java/org/apache/nutch/microformats/reltag/RelTagParser.java +++ b/src/plugin/microformats-reltag/src/java/org/apache/nutch/microformats/reltag/RelTagParser.java @@ -20,6 +20,7 @@ package org.apache.nutch.microformats.reltag; import java.net.URL; import java.net.URLDecoder; import java.nio.ByteBuffer; +import java.nio.charset.StandardCharsets; import java.util.Collection; import java.util.HashSet; import java.util.Iterator; @@ -171,7 +172,7 @@ public class RelTagParser implements ParseFilter { sb.append(iter.next()); sb.append("\t"); } - ByteBuffer bb = ByteBuffer.wrap(sb.toString().getBytes()); + ByteBuffer bb = ByteBuffer.wrap(sb.toString().getBytes(StandardCharsets.UTF_8)); page.getMetadata().put(new Utf8(REL_TAG), bb); return parse; } http://git-wip-us.apache.org/repos/asf/nutch/blob/a671540a/src/plugin/microformats-reltag/src/test/org/apache/nutch/microformats/reltag/TestRelTagParser.java ---------------------------------------------------------------------- diff --git a/src/plugin/microformats-reltag/src/test/org/apache/nutch/microformats/reltag/TestRelTagParser.java b/src/plugin/microformats-reltag/src/test/org/apache/nutch/microformats/reltag/TestRelTagParser.java index 064b46b..66964de 100644 --- a/src/plugin/microformats-reltag/src/test/org/apache/nutch/microformats/reltag/TestRelTagParser.java +++ b/src/plugin/microformats-reltag/src/test/org/apache/nutch/microformats/reltag/TestRelTagParser.java @@ -34,6 +34,7 @@ import 
java.io.File; import java.io.FileInputStream; import java.io.IOException; import java.nio.ByteBuffer; +import java.nio.charset.StandardCharsets; import static org.junit.Assert.assertEquals; @@ -90,7 +91,7 @@ public class TestRelTagParser { ByteBuffer bbuf = page.getMetadata().get(new Utf8("Rel-Tag")); byte[] byteArray = new byte[bbuf.remaining()]; bbuf.get(byteArray); - String s = new String(byteArray); + String s = new String(byteArray, StandardCharsets.UTF_8); // bbuf.flip(); assertEquals("We expect 2 tab-separated rel-tag's extracted by the filter", expectedRelTags, s); http://git-wip-us.apache.org/repos/asf/nutch/blob/a671540a/src/plugin/parse-html/src/java/org/apache/nutch/parse/html/DOMContentUtils.java ---------------------------------------------------------------------- diff --git a/src/plugin/parse-html/src/java/org/apache/nutch/parse/html/DOMContentUtils.java b/src/plugin/parse-html/src/java/org/apache/nutch/parse/html/DOMContentUtils.java index 3ba3716..8e079fb 100644 --- a/src/plugin/parse-html/src/java/org/apache/nutch/parse/html/DOMContentUtils.java +++ b/src/plugin/parse-html/src/java/org/apache/nutch/parse/html/DOMContentUtils.java @@ -22,6 +22,7 @@ import java.net.MalformedURLException; import java.util.Collection; import java.util.ArrayList; import java.util.HashMap; +import java.util.Locale; import org.apache.nutch.parse.Outlink; import org.apache.nutch.util.NodeWalker; @@ -320,7 +321,7 @@ public class DOMContentUtils { if (nodeType == Node.ELEMENT_NODE) { - nodeName = nodeName.toLowerCase(); + nodeName = nodeName.toLowerCase(Locale.ROOT); LinkParams params = linkParams.get(nodeName); if (params != null) { if (!shouldThrowAwayLink(currentNode, children, childLen, params)) { http://git-wip-us.apache.org/repos/asf/nutch/blob/a671540a/src/plugin/parse-html/src/java/org/apache/nutch/parse/html/HTMLMetaProcessor.java ---------------------------------------------------------------------- diff --git 
a/src/plugin/parse-html/src/java/org/apache/nutch/parse/html/HTMLMetaProcessor.java b/src/plugin/parse-html/src/java/org/apache/nutch/parse/html/HTMLMetaProcessor.java index 159aa76..3e066c4 100644 --- a/src/plugin/parse-html/src/java/org/apache/nutch/parse/html/HTMLMetaProcessor.java +++ b/src/plugin/parse-html/src/java/org/apache/nutch/parse/html/HTMLMetaProcessor.java @@ -18,6 +18,7 @@ package org.apache.nutch.parse.html; import java.net.URL; +import java.util.Locale; import org.apache.nutch.parse.HTMLMetaTags; import org.w3c.dom.*; @@ -64,7 +65,7 @@ public class HTMLMetaProcessor { // Retrieves name, http-equiv and content attributes for (int i = 0; i < attrs.getLength(); i++) { Node attr = attrs.item(i); - String attrName = attr.getNodeName().toLowerCase(); + String attrName = attr.getNodeName().toLowerCase(Locale.ROOT); if (attrName.equals("name")) { nameNode = attr; } else if (attrName.equals("http-equiv")) { @@ -76,12 +77,12 @@ public class HTMLMetaProcessor { if (nameNode != null) { if (contentNode != null) { - String name = nameNode.getNodeValue().toLowerCase(); + String name = nameNode.getNodeValue().toLowerCase(Locale.ROOT); metaTags.getGeneralTags().add(name, contentNode.getNodeValue()); if ("robots".equals(name)) { if (contentNode != null) { - String directives = contentNode.getNodeValue().toLowerCase(); + String directives = contentNode.getNodeValue().toLowerCase(Locale.ROOT); int index = directives.indexOf("none"); if (index >= 0) { @@ -116,11 +117,11 @@ public class HTMLMetaProcessor { if (equivNode != null) { if (contentNode != null) { - String name = equivNode.getNodeValue().toLowerCase(); + String name = equivNode.getNodeValue().toLowerCase(Locale.ROOT); String content = contentNode.getNodeValue(); metaTags.getHttpEquivTags().setProperty(name, content); if ("pragma".equals(name)) { - content = content.toLowerCase(); + content = content.toLowerCase(Locale.ROOT); int index = content.indexOf("no-cache"); if (index >= 0) metaTags.setNoCache(); @@ 
-140,7 +141,7 @@ public class HTMLMetaProcessor { } URL refreshUrl = null; if (metaTags.getRefresh() && idx != -1) { // set the URL - idx = content.toLowerCase().indexOf("url="); + idx = content.toLowerCase(Locale.ROOT).indexOf("url="); if (idx == -1) { // assume a mis-formatted entry with just the // url idx = content.indexOf(';') + 1; http://git-wip-us.apache.org/repos/asf/nutch/blob/a671540a/src/plugin/parse-html/src/test/org/apache/nutch/parse/html/TestDOMContentUtils.java ---------------------------------------------------------------------- diff --git a/src/plugin/parse-html/src/test/org/apache/nutch/parse/html/TestDOMContentUtils.java b/src/plugin/parse-html/src/test/org/apache/nutch/parse/html/TestDOMContentUtils.java index 5440ec7..3255dcc 100644 --- a/src/plugin/parse-html/src/test/org/apache/nutch/parse/html/TestDOMContentUtils.java +++ b/src/plugin/parse-html/src/test/org/apache/nutch/parse/html/TestDOMContentUtils.java @@ -22,6 +22,7 @@ import org.apache.hadoop.conf.Configuration; import org.apache.nutch.util.NutchConfiguration; import java.io.ByteArrayInputStream; +import java.nio.charset.StandardCharsets; import java.net.MalformedURLException; import java.net.URL; import java.util.ArrayList; @@ -182,7 +183,7 @@ public class TestDOMContentUtils { DocumentFragment node = new HTMLDocumentImpl().createDocumentFragment(); try { parser.parse( - new InputSource(new ByteArrayInputStream(testPages[i].getBytes())), + new InputSource(new ByteArrayInputStream(testPages[i].getBytes(StandardCharsets.UTF_8))), node); testBaseHrefURLs[i] = new URL(testBaseHrefs[i]); } catch (Exception e) { http://git-wip-us.apache.org/repos/asf/nutch/blob/a671540a/src/plugin/parse-html/src/test/org/apache/nutch/parse/html/TestRobotsMetaProcessor.java ---------------------------------------------------------------------- diff --git a/src/plugin/parse-html/src/test/org/apache/nutch/parse/html/TestRobotsMetaProcessor.java 
b/src/plugin/parse-html/src/test/org/apache/nutch/parse/html/TestRobotsMetaProcessor.java index 8c58ca4..f390041 100644 --- a/src/plugin/parse-html/src/test/org/apache/nutch/parse/html/TestRobotsMetaProcessor.java +++ b/src/plugin/parse-html/src/test/org/apache/nutch/parse/html/TestRobotsMetaProcessor.java @@ -20,6 +20,7 @@ package org.apache.nutch.parse.html; import org.apache.nutch.parse.HTMLMetaTags; import java.io.ByteArrayInputStream; +import java.nio.charset.StandardCharsets; import java.net.URL; import org.cyberneko.html.parsers.*; @@ -123,7 +124,7 @@ public class TestRobotsMetaProcessor { } for (int i = 0; i < tests.length; i++) { - byte[] bytes = tests[i].getBytes(); + byte[] bytes = tests[i].getBytes(StandardCharsets.UTF_8); DocumentFragment node = new HTMLDocumentImpl().createDocumentFragment(); http://git-wip-us.apache.org/repos/asf/nutch/blob/a671540a/src/plugin/parse-js/src/java/org/apache/nutch/parse/js/JSParseFilter.java ---------------------------------------------------------------------- diff --git a/src/plugin/parse-js/src/java/org/apache/nutch/parse/js/JSParseFilter.java b/src/plugin/parse-js/src/java/org/apache/nutch/parse/js/JSParseFilter.java index fc2e930..a481755 100644 --- a/src/plugin/parse-js/src/java/org/apache/nutch/parse/js/JSParseFilter.java +++ b/src/plugin/parse-js/src/java/org/apache/nutch/parse/js/JSParseFilter.java @@ -26,6 +26,7 @@ import java.util.ArrayList; import java.util.Arrays; import java.util.Collection; import java.util.List; +import java.util.Locale; import org.slf4j.Logger; import org.slf4j.LoggerFactory; @@ -150,7 +151,7 @@ public class JSParseFilter implements ParseFilter, Parser { links = getJSLinks(anode.getNodeValue(), "", base); } else if (anode.getNodeName().equalsIgnoreCase("href")) { String val = anode.getNodeValue(); - if (val != null && val.toLowerCase().indexOf("javascript:") != -1) { + if (val != null && val.toLowerCase(Locale.ROOT).indexOf("javascript:") != -1) { links = getJSLinks(val, "", base); } } 
@@ -178,7 +179,7 @@ public class JSParseFilter implements ParseFilter, Parser { public Parse getParse(String url, WebPage page) { String type = TableUtil.toString(page.getContentType()); if (type != null && !type.trim().equals("") - && !type.toLowerCase().startsWith("application/x-javascript")) + && !type.toLowerCase(Locale.ROOT).startsWith("application/x-javascript")) return ParseStatusUtils.getEmptyParse( ParseStatusCodes.FAILED_INVALID_FORMAT, "Content not JavaScript: '" + type + "'", getConf()); http://git-wip-us.apache.org/repos/asf/nutch/blob/a671540a/src/plugin/parse-metatags/src/java/org/apache/nutch/parse/metatags/MetaTagsParser.java ---------------------------------------------------------------------- diff --git a/src/plugin/parse-metatags/src/java/org/apache/nutch/parse/metatags/MetaTagsParser.java b/src/plugin/parse-metatags/src/java/org/apache/nutch/parse/metatags/MetaTagsParser.java index 2aac3c6..f61838c 100644 --- a/src/plugin/parse-metatags/src/java/org/apache/nutch/parse/metatags/MetaTagsParser.java +++ b/src/plugin/parse-metatags/src/java/org/apache/nutch/parse/metatags/MetaTagsParser.java @@ -27,6 +27,7 @@ import java.util.Map; import java.util.Properties; import java.util.Set; import java.util.Map.Entry; +import java.nio.charset.StandardCharsets; import org.apache.avro.util.Utf8; import org.apache.commons.logging.Log; @@ -83,7 +84,7 @@ public class MetaTagsParser implements ParseFilter { LOG.debug("Found meta tag: " + lcMetatag + "\t" + value); } metadata.put(new Utf8(PARSE_META_PREFIX + lcMetatag), - ByteBuffer.wrap(value.getBytes())); + ByteBuffer.wrap(value.getBytes(StandardCharsets.UTF_8))); } } http://git-wip-us.apache.org/repos/asf/nutch/blob/a671540a/src/plugin/parse-metatags/src/test/org/apache/nutch/parse/metatags/TestMetaTagsParser.java ---------------------------------------------------------------------- diff --git a/src/plugin/parse-metatags/src/test/org/apache/nutch/parse/metatags/TestMetaTagsParser.java 
b/src/plugin/parse-metatags/src/test/org/apache/nutch/parse/metatags/TestMetaTagsParser.java index 1b42263..a13eac7 100644 --- a/src/plugin/parse-metatags/src/test/org/apache/nutch/parse/metatags/TestMetaTagsParser.java +++ b/src/plugin/parse-metatags/src/test/org/apache/nutch/parse/metatags/TestMetaTagsParser.java @@ -48,6 +48,7 @@ import java.net.MalformedURLException; import java.net.URL; import java.nio.ByteBuffer; import java.util.Map; +import java.util.Locale; public class TestMetaTagsParser { @@ -129,7 +130,7 @@ public class TestMetaTagsParser { // Retrieves name, http-equiv and content attribues for (int i = 0; i < attrs.getLength(); i++) { Node attr = attrs.item(i); - String attrName = attr.getNodeName().toLowerCase(); + String attrName = attr.getNodeName().toLowerCase(Locale.ROOT); if (attrName.equals("name")) { nameNode = attr; } else if (attrName.equals("http-equiv")) { @@ -140,14 +141,14 @@ public class TestMetaTagsParser { } if (nameNode != null) { if (contentNode != null) { - String name = nameNode.getNodeValue().toLowerCase(); + String name = nameNode.getNodeValue().toLowerCase(Locale.ROOT); metaTags.getGeneralTags().add(name, contentNode.getNodeValue()); } } if (equivNode != null) { if (contentNode != null) { - String name = equivNode.getNodeValue().toLowerCase(); + String name = equivNode.getNodeValue().toLowerCase(Locale.ROOT); String content = contentNode.getNodeValue(); metaTags.getHttpEquivTags().setProperty(name, content); } http://git-wip-us.apache.org/repos/asf/nutch/blob/a671540a/src/plugin/parse-tika/src/java/org/apache/nutch/parse/tika/DOMContentUtils.java ---------------------------------------------------------------------- diff --git a/src/plugin/parse-tika/src/java/org/apache/nutch/parse/tika/DOMContentUtils.java b/src/plugin/parse-tika/src/java/org/apache/nutch/parse/tika/DOMContentUtils.java index ae1cb44..ee95862 100644 --- a/src/plugin/parse-tika/src/java/org/apache/nutch/parse/tika/DOMContentUtils.java +++ 
b/src/plugin/parse-tika/src/java/org/apache/nutch/parse/tika/DOMContentUtils.java @@ -22,6 +22,8 @@ import java.net.URL; import java.util.ArrayList; import java.util.Collection; import java.util.HashMap; +import java.util.Locale; +import java.nio.charset.StandardCharsets; import org.apache.hadoop.conf.Configuration; import org.apache.nutch.parse.Outlink; @@ -321,7 +323,7 @@ public class DOMContentUtils { if (nodeType == Node.ELEMENT_NODE) { - nodeName = nodeName.toLowerCase(); + nodeName = nodeName.toLowerCase(Locale.ROOT); LinkParams params = (LinkParams) linkParams.get(nodeName); if (params != null) { if (!shouldThrowAwayLink(currentNode, children, childLen, params)) { http://git-wip-us.apache.org/repos/asf/nutch/blob/a671540a/src/plugin/parse-tika/src/java/org/apache/nutch/parse/tika/HTMLMetaProcessor.java ---------------------------------------------------------------------- diff --git a/src/plugin/parse-tika/src/java/org/apache/nutch/parse/tika/HTMLMetaProcessor.java b/src/plugin/parse-tika/src/java/org/apache/nutch/parse/tika/HTMLMetaProcessor.java index 294bde9..0818eff 100644 --- a/src/plugin/parse-tika/src/java/org/apache/nutch/parse/tika/HTMLMetaProcessor.java +++ b/src/plugin/parse-tika/src/java/org/apache/nutch/parse/tika/HTMLMetaProcessor.java @@ -18,6 +18,7 @@ package org.apache.nutch.parse.tika; import java.net.URL; +import java.util.Locale; import org.apache.nutch.parse.HTMLMetaTags; import org.w3c.dom.*; @@ -64,7 +65,7 @@ public class HTMLMetaProcessor { // Retrieves name, http-equiv and content attribues for (int i = 0; i < attrs.getLength(); i++) { Node attr = attrs.item(i); - String attrName = attr.getNodeName().toLowerCase(); + String attrName = attr.getNodeName().toLowerCase(Locale.ROOT); if (attrName.equals("name")) { nameNode = attr; } else if (attrName.equals("http-equiv")) { @@ -76,12 +77,12 @@ public class HTMLMetaProcessor { if (nameNode != null) { if (contentNode != null) { - String name = nameNode.getNodeValue().toLowerCase(); + String 
name = nameNode.getNodeValue().toLowerCase(Locale.ROOT); metaTags.getGeneralTags().add(name, contentNode.getNodeValue()); if ("robots".equals(name)) { if (contentNode != null) { - String directives = contentNode.getNodeValue().toLowerCase(); + String directives = contentNode.getNodeValue().toLowerCase(Locale.ROOT); int index = directives.indexOf("none"); if (index >= 0) { @@ -116,11 +117,11 @@ public class HTMLMetaProcessor { if (equivNode != null) { if (contentNode != null) { - String name = equivNode.getNodeValue().toLowerCase(); + String name = equivNode.getNodeValue().toLowerCase(Locale.ROOT); String content = contentNode.getNodeValue(); metaTags.getHttpEquivTags().setProperty(name, content); if ("pragma".equals(name)) { - content = content.toLowerCase(); + content = content.toLowerCase(Locale.ROOT); int index = content.indexOf("no-cache"); if (index >= 0) metaTags.setNoCache(); @@ -140,7 +141,7 @@ public class HTMLMetaProcessor { } URL refreshUrl = null; if (metaTags.getRefresh() && idx != -1) { // set the URL - idx = content.toLowerCase().indexOf("url="); + idx = content.toLowerCase(Locale.ROOT).indexOf("url="); if (idx == -1) { // assume a mis-formatted entry with just the // url idx = content.indexOf(';') + 1; http://git-wip-us.apache.org/repos/asf/nutch/blob/a671540a/src/plugin/parse-tika/src/test/org/apache/nutch/parse/tika/DOMContentUtilsTest.java ---------------------------------------------------------------------- diff --git a/src/plugin/parse-tika/src/test/org/apache/nutch/parse/tika/DOMContentUtilsTest.java b/src/plugin/parse-tika/src/test/org/apache/nutch/parse/tika/DOMContentUtilsTest.java index 957d664..06bea9f 100644 --- a/src/plugin/parse-tika/src/test/org/apache/nutch/parse/tika/DOMContentUtilsTest.java +++ b/src/plugin/parse-tika/src/test/org/apache/nutch/parse/tika/DOMContentUtilsTest.java @@ -39,6 +39,7 @@ import org.slf4j.LoggerFactory; import org.w3c.dom.DocumentFragment; import java.io.ByteArrayInputStream; +import 
java.nio.charset.StandardCharsets; import java.net.URL; import java.util.ArrayList; import java.util.StringTokenizer; @@ -210,7 +211,7 @@ public class DOMContentUtilsTest { // to add once available in Tika // context.set(HtmlMapper.class, IdentityHtmlMapper.INSTANCE); try { - parser.parse(new ByteArrayInputStream(testPages[i].getBytes()), + parser.parse(new ByteArrayInputStream(testPages[i].getBytes(StandardCharsets.UTF_8)), domhandler, tikamd, context); testBaseHrefURLs[i] = new URL(testBaseHrefs[i]); } catch (Exception e) { http://git-wip-us.apache.org/repos/asf/nutch/blob/a671540a/src/plugin/parse-tika/src/test/org/apache/nutch/parse/tika/TestImageMetadata.java ---------------------------------------------------------------------- diff --git a/src/plugin/parse-tika/src/test/org/apache/nutch/parse/tika/TestImageMetadata.java b/src/plugin/parse-tika/src/test/org/apache/nutch/parse/tika/TestImageMetadata.java index 3a1204c..350be0e 100644 --- a/src/plugin/parse-tika/src/test/org/apache/nutch/parse/tika/TestImageMetadata.java +++ b/src/plugin/parse-tika/src/test/org/apache/nutch/parse/tika/TestImageMetadata.java @@ -23,6 +23,7 @@ import java.io.File; import java.io.FileInputStream; import java.io.IOException; import java.nio.ByteBuffer; +import java.nio.charset.StandardCharsets; import org.apache.avro.util.Utf8; import org.apache.hadoop.conf.Configuration; @@ -72,14 +73,14 @@ public class TestImageMetadata { ByteBuffer bbufW = page.getMetadata().get(new Utf8("width")); byte[] byteArrayW = new byte[bbufW.remaining()]; bbufW.get(byteArrayW); - String width = new String(byteArrayW); + String width = new String(byteArrayW, StandardCharsets.UTF_8); assertEquals("121", width); // assert height ByteBuffer bbufH = page.getMetadata().get(new Utf8("height")); byte[] byteArrayH = new byte[bbufH.remaining()]; bbufH.get(byteArrayH); - String height = new String(byteArrayH); + String height = new String(byteArrayH, StandardCharsets.UTF_8); assertEquals("48", height); } } 
http://git-wip-us.apache.org/repos/asf/nutch/blob/a671540a/src/plugin/protocol-file/src/java/org/apache/nutch/protocol/file/File.java ---------------------------------------------------------------------- diff --git a/src/plugin/protocol-file/src/java/org/apache/nutch/protocol/file/File.java b/src/plugin/protocol-file/src/java/org/apache/nutch/protocol/file/File.java index e7139cc..0695439 100644 --- a/src/plugin/protocol-file/src/java/org/apache/nutch/protocol/file/File.java +++ b/src/plugin/protocol-file/src/java/org/apache/nutch/protocol/file/File.java @@ -28,6 +28,7 @@ import org.slf4j.Logger; import org.slf4j.LoggerFactory; import java.net.URL; +import java.nio.charset.StandardCharsets; import java.util.Collection; import java.util.HashSet; @@ -218,7 +219,7 @@ public class File implements Protocol { } if (dumpContent) { - System.out.print(new String(content.getContent())); + System.out.print(new String(content.getContent(), StandardCharsets.UTF_8)); } file = null; http://git-wip-us.apache.org/repos/asf/nutch/blob/a671540a/src/plugin/protocol-file/src/java/org/apache/nutch/protocol/file/FileResponse.java ---------------------------------------------------------------------- diff --git a/src/plugin/protocol-file/src/java/org/apache/nutch/protocol/file/FileResponse.java b/src/plugin/protocol-file/src/java/org/apache/nutch/protocol/file/FileResponse.java index 410f8e6..0e5f2b0 100644 --- a/src/plugin/protocol-file/src/java/org/apache/nutch/protocol/file/FileResponse.java +++ b/src/plugin/protocol-file/src/java/org/apache/nutch/protocol/file/FileResponse.java @@ -20,6 +20,7 @@ package org.apache.nutch.protocol.file; // JDK imports import java.io.IOException; import java.io.UnsupportedEncodingException; +import java.nio.charset.StandardCharsets; import java.net.URL; import org.apache.hadoop.conf.Configuration; @@ -275,7 +276,7 @@ public class FileResponse { x.append("</pre></body></html>\n"); - return new String(x).getBytes(); + return new 
String(x).getBytes(StandardCharsets.UTF_8); } } http://git-wip-us.apache.org/repos/asf/nutch/blob/a671540a/src/plugin/protocol-ftp/src/java/org/apache/nutch/protocol/ftp/Client.java ---------------------------------------------------------------------- diff --git a/src/plugin/protocol-ftp/src/java/org/apache/nutch/protocol/ftp/Client.java b/src/plugin/protocol-ftp/src/java/org/apache/nutch/protocol/ftp/Client.java index 2478b0a..ffa2091 100644 --- a/src/plugin/protocol-ftp/src/java/org/apache/nutch/protocol/ftp/Client.java +++ b/src/plugin/protocol-ftp/src/java/org/apache/nutch/protocol/ftp/Client.java @@ -22,6 +22,7 @@ import java.io.IOException; import java.io.InputStream; import java.io.InputStreamReader; import java.io.OutputStream; +import java.nio.charset.StandardCharsets; import java.net.InetAddress; import java.net.Socket; import java.util.List; @@ -330,7 +331,7 @@ public class Client extends FTP { + ((path == null) ? "" : path)); BufferedReader reader = new BufferedReader(new InputStreamReader( - socket.getInputStream())); + socket.getInputStream(), StandardCharsets.UTF_8)); // force-close data channel socket, when download limit is reached // boolean mandatory_close = false; http://git-wip-us.apache.org/repos/asf/nutch/blob/a671540a/src/plugin/protocol-ftp/src/java/org/apache/nutch/protocol/ftp/Ftp.java ---------------------------------------------------------------------- diff --git a/src/plugin/protocol-ftp/src/java/org/apache/nutch/protocol/ftp/Ftp.java b/src/plugin/protocol-ftp/src/java/org/apache/nutch/protocol/ftp/Ftp.java index ccfae0a..3f3a7e8 100644 --- a/src/plugin/protocol-ftp/src/java/org/apache/nutch/protocol/ftp/Ftp.java +++ b/src/plugin/protocol-ftp/src/java/org/apache/nutch/protocol/ftp/Ftp.java @@ -31,6 +31,7 @@ import java.io.IOException; import java.net.URL; import java.util.Collection; import java.util.HashSet; +import java.nio.charset.StandardCharsets; /** * This class is a protocol plugin used for ftp: scheme. 
It creates @@ -243,7 +244,7 @@ public class Ftp implements Protocol { System.err.println("Last-Modified: " + content.getMetadata().get(Response.LAST_MODIFIED)); if (dumpContent) { - System.out.print(new String(content.getContent())); + System.out.print(new String(content.getContent(), StandardCharsets.UTF_8)); } ftp = null;