Repository: nutch Updated Branches: refs/heads/2.x 75d846cf3 -> 32d1486df
http://git-wip-us.apache.org/repos/asf/nutch/blob/a671540a/src/plugin/protocol-ftp/src/java/org/apache/nutch/protocol/ftp/FtpResponse.java ---------------------------------------------------------------------- diff --git a/src/plugin/protocol-ftp/src/java/org/apache/nutch/protocol/ftp/FtpResponse.java b/src/plugin/protocol-ftp/src/java/org/apache/nutch/protocol/ftp/FtpResponse.java index 0db617c..d7acfdc 100644 --- a/src/plugin/protocol-ftp/src/java/org/apache/nutch/protocol/ftp/FtpResponse.java +++ b/src/plugin/protocol-ftp/src/java/org/apache/nutch/protocol/ftp/FtpResponse.java @@ -35,6 +35,7 @@ import java.io.IOException; import java.net.InetAddress; import java.net.URL; import java.nio.ByteBuffer; +import java.nio.charset.StandardCharsets; import java.util.LinkedList; import java.util.List; @@ -114,7 +115,7 @@ public class FtpResponse { if (addr != null && conf.getBoolean("store.ip.address", false) == true) { String ipString = addr.getHostAddress(); // get the ip address page.getMetadata().put(new Utf8("_ip_"), - ByteBuffer.wrap(ipString.getBytes())); + ByteBuffer.wrap(ipString.getBytes(StandardCharsets.UTF_8))); } // idled too long, remote server or ourselves may have timed out, @@ -521,7 +522,7 @@ public class FtpResponse { x.append("</pre></body></html>\n"); - return new String(x).getBytes(); + return new String(x).getBytes(StandardCharsets.UTF_8); } } http://git-wip-us.apache.org/repos/asf/nutch/blob/a671540a/src/plugin/protocol-ftp/src/java/org/apache/nutch/protocol/ftp/FtpRobotRulesParser.java ---------------------------------------------------------------------- diff --git a/src/plugin/protocol-ftp/src/java/org/apache/nutch/protocol/ftp/FtpRobotRulesParser.java b/src/plugin/protocol-ftp/src/java/org/apache/nutch/protocol/ftp/FtpRobotRulesParser.java index cc039bd..4ce9a83 100644 --- a/src/plugin/protocol-ftp/src/java/org/apache/nutch/protocol/ftp/FtpRobotRulesParser.java +++ b/src/plugin/protocol-ftp/src/java/org/apache/nutch/protocol/ftp/FtpRobotRulesParser.java @@ -29,6 +29,7 @@ import org.slf4j.Logger; import org.slf4j.LoggerFactory; import java.net.URL; +import java.util.Locale; /** * This class is used for parsing robots for urls belonging to FTP protocol. It @@ -63,9 +64,9 @@ public class FtpRobotRulesParser extends RobotRulesParser { */ public BaseRobotRules getRobotRulesSet(Protocol ftp, URL url) { - String protocol = url.getProtocol().toLowerCase(); // normalize to lower + String protocol = url.getProtocol().toLowerCase(Locale.ROOT); // normalize to lower // case - String host = url.getHost().toLowerCase(); // normalize to lower case + String host = url.getHost().toLowerCase(Locale.ROOT); // normalize to lower case BaseRobotRules robotRules = (SimpleRobotRules) CACHE.get(protocol + ":" + host); http://git-wip-us.apache.org/repos/asf/nutch/blob/a671540a/src/plugin/protocol-http/src/java/org/apache/nutch/protocol/http/HttpResponse.java ---------------------------------------------------------------------- diff --git a/src/plugin/protocol-http/src/java/org/apache/nutch/protocol/http/HttpResponse.java b/src/plugin/protocol-http/src/java/org/apache/nutch/protocol/http/HttpResponse.java index 773958a..965eaac 100644 --- a/src/plugin/protocol-http/src/java/org/apache/nutch/protocol/http/HttpResponse.java +++ b/src/plugin/protocol-http/src/java/org/apache/nutch/protocol/http/HttpResponse.java @@ -36,6 +36,7 @@ import javax.net.ssl.SSLSocket; import javax.net.ssl.SSLSocketFactory; import java.net.URL; import java.nio.ByteBuffer; +import java.nio.charset.StandardCharsets; import java.util.Arrays; import java.util.HashSet; import java.util.Set; @@ -138,7 +139,7 @@ public class HttpResponse implements Response { String ipString = sockAddr.getAddress().getHostAddress(); // get the ip // address page.getMetadata().put(new Utf8("_ip_"), - ByteBuffer.wrap(ipString.getBytes())); + ByteBuffer.wrap(ipString.getBytes(StandardCharsets.UTF_8))); } // make request @@ -182,7 +183,7 @@ public class HttpResponse implements Response { // } reqStr.append("\r\n"); - byte[] reqBytes = reqStr.toString().getBytes(); + byte[] reqBytes = reqStr.toString().getBytes(StandardCharsets.UTF_8); req.write(reqBytes); req.flush(); http://git-wip-us.apache.org/repos/asf/nutch/blob/a671540a/src/plugin/protocol-httpclient/src/java/org/apache/nutch/protocol/httpclient/HttpBasicAuthentication.java ---------------------------------------------------------------------- diff --git a/src/plugin/protocol-httpclient/src/java/org/apache/nutch/protocol/httpclient/HttpBasicAuthentication.java b/src/plugin/protocol-httpclient/src/java/org/apache/nutch/protocol/httpclient/HttpBasicAuthentication.java index 094abba..a15f91b 100644 --- a/src/plugin/protocol-httpclient/src/java/org/apache/nutch/protocol/httpclient/HttpBasicAuthentication.java +++ b/src/plugin/protocol-httpclient/src/java/org/apache/nutch/protocol/httpclient/HttpBasicAuthentication.java @@ -23,6 +23,7 @@ import java.util.Map; import java.util.TreeMap; import java.util.regex.Matcher; import java.util.regex.Pattern; +import java.nio.charset.StandardCharsets; // Commons Codec imports import org.apache.commons.codec.binary.Base64; @@ -92,9 +93,9 @@ public class HttpBasicAuthentication implements HttpAuthentication, + " is null"); } - byte[] credBytes = (username + ":" + password).getBytes(); + byte[] credBytes = (username + ":" + password).getBytes(StandardCharsets.UTF_8); credentials.add("Authorization: Basic " - + new String(Base64.encodeBase64(credBytes))); + + new String(Base64.encodeBase64(credBytes), StandardCharsets.UTF_8)); if (LOG.isTraceEnabled()) { LOG.trace("Basic credentials: " + credentials); } http://git-wip-us.apache.org/repos/asf/nutch/blob/a671540a/src/plugin/protocol-sftp/src/java/org/apache/nutch/protocol/sftp/Sftp.java ---------------------------------------------------------------------- diff --git a/src/plugin/protocol-sftp/src/java/org/apache/nutch/protocol/sftp/Sftp.java b/src/plugin/protocol-sftp/src/java/org/apache/nutch/protocol/sftp/Sftp.java index 1545aa2..2dbdd48 100644 --- a/src/plugin/protocol-sftp/src/java/org/apache/nutch/protocol/sftp/Sftp.java +++ b/src/plugin/protocol-sftp/src/java/org/apache/nutch/protocol/sftp/Sftp.java @@ -20,6 +20,7 @@ package org.apache.nutch.protocol.sftp; //JDK imports import java.io.IOException; import java.io.InputStream; +import java.nio.charset.StandardCharsets; import java.net.MalformedURLException; import java.net.URL; import java.util.Collection; @@ -222,7 +223,7 @@ public class Sftp implements Protocol { metadata.set(Response.LOCATION, url.toExternalForm()); Content content = new Content(url.toExternalForm(), url.toExternalForm(), - directoryList.getBytes(), "text/html", metadata, configuration); + directoryList.getBytes(StandardCharsets.UTF_8), "text/html", metadata, configuration); ProtocolOutput po = new ProtocolOutput(content); return po; } catch (SftpException e) { http://git-wip-us.apache.org/repos/asf/nutch/blob/a671540a/src/plugin/scoring-opic/src/test/org/apache/nutch/scoring/opic/TestOPICScoringFilter.java ---------------------------------------------------------------------- diff --git a/src/plugin/scoring-opic/src/test/org/apache/nutch/scoring/opic/TestOPICScoringFilter.java b/src/plugin/scoring-opic/src/test/org/apache/nutch/scoring/opic/TestOPICScoringFilter.java index 587a218..00e6a6a 100644 --- a/src/plugin/scoring-opic/src/test/org/apache/nutch/scoring/opic/TestOPICScoringFilter.java +++ b/src/plugin/scoring-opic/src/test/org/apache/nutch/scoring/opic/TestOPICScoringFilter.java @@ -24,6 +24,7 @@ import java.util.LinkedList; import java.util.List; import java.util.Map; import java.util.Map.Entry; +import java.util.Locale; import org.apache.hadoop.conf.Configuration; import org.apache.nutch.scoring.ScoreDatum; @@ -32,6 +33,7 @@ import org.apache.nutch.util.NutchConfiguration; import org.apache.nutch.util.TableUtil; import java.text.DecimalFormat; +import java.text.NumberFormat; import org.junit.Before; import org.junit.Test; @@ -55,7 +57,11 @@ public class TestOPICScoringFilter { private final List<ScoreDatum> outlinkedScoreData = new ArrayList<ScoreDatum>(); private static final int DEPTH = 3; - DecimalFormat df = new DecimalFormat("#.###"); + DecimalFormat df = (DecimalFormat) NumberFormat.getNumberInstance(Locale.ROOT); + + { + df.applyPattern("#.###"); + } private final String[] seedList = new String[] { "http://a.com", "http://b.com", "http://c.com", }; http://git-wip-us.apache.org/repos/asf/nutch/blob/a671540a/src/plugin/subcollection/src/test/org/apache/nutch/collection/TestSubcollection.java ---------------------------------------------------------------------- diff --git a/src/plugin/subcollection/src/test/org/apache/nutch/collection/TestSubcollection.java b/src/plugin/subcollection/src/test/org/apache/nutch/collection/TestSubcollection.java index 5162a9b..8b81023 100644 --- a/src/plugin/subcollection/src/test/org/apache/nutch/collection/TestSubcollection.java +++ b/src/plugin/subcollection/src/test/org/apache/nutch/collection/TestSubcollection.java @@ -19,6 +19,7 @@ package org.apache.nutch.collection; import java.io.ByteArrayInputStream; import java.io.InputStream; import java.util.Collection; +import java.nio.charset.StandardCharsets; import org.apache.nutch.util.NutchConfiguration; @@ -69,7 +70,7 @@ public class TestSubcollection { xml.append("</subcollection>"); xml.append("</subcollections>"); - InputStream is = new ByteArrayInputStream(xml.toString().getBytes()); + InputStream is = new ByteArrayInputStream(xml.toString().getBytes(StandardCharsets.UTF_8)); CollectionManager cm = new CollectionManager(); cm.parse(is); http://git-wip-us.apache.org/repos/asf/nutch/blob/a671540a/src/plugin/urlfilter-domain/src/java/org/apache/nutch/urlfilter/domain/DomainURLFilter.java ---------------------------------------------------------------------- diff --git a/src/plugin/urlfilter-domain/src/java/org/apache/nutch/urlfilter/domain/DomainURLFilter.java b/src/plugin/urlfilter-domain/src/java/org/apache/nutch/urlfilter/domain/DomainURLFilter.java index 6d94391..bf1ef42 100644 --- a/src/plugin/urlfilter-domain/src/java/org/apache/nutch/urlfilter/domain/DomainURLFilter.java +++ b/src/plugin/urlfilter-domain/src/java/org/apache/nutch/urlfilter/domain/DomainURLFilter.java @@ -21,8 +21,12 @@ import java.io.FileReader; import java.io.IOException; import java.io.Reader; import java.io.StringReader; +import java.io.FileInputStream; +import java.io.InputStreamReader; import java.util.LinkedHashSet; import java.util.Set; +import java.util.Locale; +import java.nio.charset.StandardCharsets; import org.apache.commons.lang.StringUtils; import org.slf4j.Logger; @@ -167,7 +171,7 @@ public class DomainURLFilter implements URLFilter { } try { if (reader == null) { - reader = new FileReader(file); + reader = new InputStreamReader(new FileInputStream(file), StandardCharsets.UTF_8); } readConfiguration(reader); } catch (IOException e) { @@ -185,7 +189,7 @@ public class DomainURLFilter implements URLFilter { // match for suffix, domain, and host in that order. more general will // override more specific - String domain = URLUtil.getDomainName(url).toLowerCase().trim(); + String domain = URLUtil.getDomainName(url).toLowerCase(Locale.ROOT).trim(); String host = URLUtil.getHost(url); String suffix = null; DomainSuffix domainSuffix = URLUtil.getDomainSuffix(url); http://git-wip-us.apache.org/repos/asf/nutch/blob/a671540a/src/plugin/urlfilter-prefix/src/java/org/apache/nutch/urlfilter/prefix/PrefixURLFilter.java ---------------------------------------------------------------------- diff --git a/src/plugin/urlfilter-prefix/src/java/org/apache/nutch/urlfilter/prefix/PrefixURLFilter.java b/src/plugin/urlfilter-prefix/src/java/org/apache/nutch/urlfilter/prefix/PrefixURLFilter.java index 0bc3ca1..366c11e 100644 --- a/src/plugin/urlfilter-prefix/src/java/org/apache/nutch/urlfilter/prefix/PrefixURLFilter.java +++ b/src/plugin/urlfilter-prefix/src/java/org/apache/nutch/urlfilter/prefix/PrefixURLFilter.java @@ -36,6 +36,7 @@ import java.io.BufferedReader; import java.io.InputStreamReader; import java.io.IOException; import java.io.StringReader; +import java.nio.charset.StandardCharsets; import java.util.List; import java.util.ArrayList; @@ -109,7 +110,7 @@ public class PrefixURLFilter implements URLFilter { else filter = new PrefixURLFilter(); - BufferedReader in = new BufferedReader(new InputStreamReader(System.in)); + BufferedReader in = new BufferedReader(new InputStreamReader(System.in, StandardCharsets.UTF_8)); String line; while ((line = in.readLine()) != null) { String out = filter.filter(line); http://git-wip-us.apache.org/repos/asf/nutch/blob/a671540a/src/plugin/urlfilter-suffix/src/java/org/apache/nutch/urlfilter/suffix/SuffixURLFilter.java ---------------------------------------------------------------------- diff --git a/src/plugin/urlfilter-suffix/src/java/org/apache/nutch/urlfilter/suffix/SuffixURLFilter.java b/src/plugin/urlfilter-suffix/src/java/org/apache/nutch/urlfilter/suffix/SuffixURLFilter.java index 6c14e63..1a7492a 100644 --- a/src/plugin/urlfilter-suffix/src/java/org/apache/nutch/urlfilter/suffix/SuffixURLFilter.java +++ b/src/plugin/urlfilter-suffix/src/java/org/apache/nutch/urlfilter/suffix/SuffixURLFilter.java @@ -33,11 +33,14 @@ import java.io.Reader; import java.io.FileReader; import java.io.BufferedReader; import java.io.InputStreamReader; +import java.io.FileInputStream; import java.io.IOException; import java.io.StringReader; +import java.nio.charset.StandardCharsets; import java.util.List; import java.util.ArrayList; +import java.util.Locale; import java.net.URL; import java.net.MalformedURLException; @@ -151,7 +154,7 @@ public class SuffixURLFilter implements URLFilter { return null; String _url; if (ignoreCase) - _url = url.toLowerCase(); + _url = url.toLowerCase(Locale.ROOT); else _url = url; if (filterFromPath) { @@ -225,7 +228,7 @@ public class SuffixURLFilter implements URLFilter { } if (ignore) { for (int i = 0; i < aSuffixes.size(); i++) { - aSuffixes.set(i, ((String) aSuffixes.get(i)).toLowerCase()); + aSuffixes.set(i, ((String) aSuffixes.get(i)).toLowerCase(Locale.ROOT)); } } suffixes = new SuffixStringMatcher(aSuffixes); @@ -236,14 +239,14 @@ public class SuffixURLFilter implements URLFilter { public static void main(String args[]) throws IOException { SuffixURLFilter filter; - if (args.length >= 1) - filter = new SuffixURLFilter(new FileReader(args[0])); - else { + if (args.length >= 1) { + filter = new SuffixURLFilter(new InputStreamReader(new FileInputStream(args[0]), StandardCharsets.UTF_8)); + } else { filter = new SuffixURLFilter(); filter.setConf(NutchConfiguration.create()); } - BufferedReader in = new BufferedReader(new InputStreamReader(System.in)); + BufferedReader in = new BufferedReader(new InputStreamReader(System.in, StandardCharsets.UTF_8)); String line; while ((line = in.readLine()) != null) { String out = filter.filter(line); http://git-wip-us.apache.org/repos/asf/nutch/blob/a671540a/src/plugin/urlnormalizer-basic/src/java/org/apache/nutch/net/urlnormalizer/basic/BasicURLNormalizer.java ---------------------------------------------------------------------- diff --git a/src/plugin/urlnormalizer-basic/src/java/org/apache/nutch/net/urlnormalizer/basic/BasicURLNormalizer.java b/src/plugin/urlnormalizer-basic/src/java/org/apache/nutch/net/urlnormalizer/basic/BasicURLNormalizer.java index 0fae8da..3652d47 100644 --- a/src/plugin/urlnormalizer-basic/src/java/org/apache/nutch/net/urlnormalizer/basic/BasicURLNormalizer.java +++ b/src/plugin/urlnormalizer-basic/src/java/org/apache/nutch/net/urlnormalizer/basic/BasicURLNormalizer.java @@ -20,16 +20,12 @@ package org.apache.nutch.net.urlnormalizer.basic; import java.io.BufferedReader; import java.io.IOException; import java.io.InputStreamReader; +import java.nio.charset.StandardCharsets; import java.net.URISyntaxException; import java.net.URL; import java.net.MalformedURLException; import java.util.regex.Pattern; - - - - - - +import java.util.Locale; // Commons Logging imports import org.slf4j.Logger; @@ -83,7 +79,7 @@ public class BasicURLNormalizer extends Configured implements URLNormalizer { || "ftp".equals(protocol)) { if (host != null) { - String newHost = host.toLowerCase(); // lowercase host + String newHost = host.toLowerCase(Locale.ROOT); // lowercase host if (!host.equals(newHost)) { host = newHost; changed = true; @@ -161,7 +157,7 @@ public class BasicURLNormalizer extends Configured implements URLNormalizer { System.out.println("Scope: " + scope); } String line, normUrl; - BufferedReader in = new BufferedReader(new InputStreamReader(System.in)); + BufferedReader in = new BufferedReader(new InputStreamReader(System.in, StandardCharsets.UTF_8)); while ((line = in.readLine()) != null) { try { normUrl = normalizer.normalize(line, scope); http://git-wip-us.apache.org/repos/asf/nutch/blob/a671540a/src/plugin/urlnormalizer-regex/src/java/org/apache/nutch/net/urlnormalizer/regex/RegexURLNormalizer.java ---------------------------------------------------------------------- diff --git a/src/plugin/urlnormalizer-regex/src/java/org/apache/nutch/net/urlnormalizer/regex/RegexURLNormalizer.java b/src/plugin/urlnormalizer-regex/src/java/org/apache/nutch/net/urlnormalizer/regex/RegexURLNormalizer.java index 363da18..d460d9e 100644 --- a/src/plugin/urlnormalizer-regex/src/java/org/apache/nutch/net/urlnormalizer/regex/RegexURLNormalizer.java +++ b/src/plugin/urlnormalizer-regex/src/java/org/apache/nutch/net/urlnormalizer/regex/RegexURLNormalizer.java @@ -21,6 +21,9 @@ import java.io.FileReader; import java.io.IOException; import java.io.Reader; import java.io.StringReader; +import java.io.InputStreamReader; +import java.io.FileInputStream; +import java.nio.charset.StandardCharsets; import java.net.MalformedURLException; import java.util.ArrayList; import java.util.Collections; @@ -205,8 +208,8 @@ public class RegexURLNormalizer extends Configured implements URLNormalizer { LOG.info("loading " + filename); } try { - FileReader reader = new FileReader(filename); - return readConfiguration(reader); + InputStreamReader isr = new InputStreamReader(new FileInputStream(filename), StandardCharsets.UTF_8); + return readConfiguration(isr); } catch (Exception e) { LOG.error("Error loading rules from '" + filename + "': " + e); return EMPTY_RULES; http://git-wip-us.apache.org/repos/asf/nutch/blob/a671540a/src/plugin/urlnormalizer-regex/src/test/org/apache/nutch/net/urlnormalizer/regex/TestRegexURLNormalizer.java ---------------------------------------------------------------------- diff --git a/src/plugin/urlnormalizer-regex/src/test/org/apache/nutch/net/urlnormalizer/regex/TestRegexURLNormalizer.java b/src/plugin/urlnormalizer-regex/src/test/org/apache/nutch/net/urlnormalizer/regex/TestRegexURLNormalizer.java index 92fd630..128ecdc 100644 --- a/src/plugin/urlnormalizer-regex/src/test/org/apache/nutch/net/urlnormalizer/regex/TestRegexURLNormalizer.java +++ b/src/plugin/urlnormalizer-regex/src/test/org/apache/nutch/net/urlnormalizer/regex/TestRegexURLNormalizer.java @@ -24,6 +24,7 @@ import java.io.FileInputStream; import java.io.FileReader; import java.io.IOException; import java.io.InputStreamReader; +import java.nio.charset.StandardCharsets; import java.util.ArrayList; import java.util.HashMap; import java.util.Iterator; @@ -68,10 +69,10 @@ public class TestRegexURLNormalizer { }); for (int i = 0; i < configs.length; i++) { try { - FileReader reader = new FileReader(configs[i]); + InputStreamReader isr = new InputStreamReader(new FileInputStream(configs[i]), StandardCharsets.UTF_8);; String cname = configs[i].getName(); cname = cname.substring(16, cname.indexOf(".xml")); - normalizer.setConfiguration(reader, cname); + normalizer.setConfiguration(isr, cname); NormalizedURL[] urls = readTestFile(cname); testData.put(cname, urls); } catch (Exception e) { http://git-wip-us.apache.org/repos/asf/nutch/blob/a671540a/src/test/org/apache/nutch/parse/TestSitemapParser.java ---------------------------------------------------------------------- diff --git a/src/test/org/apache/nutch/parse/TestSitemapParser.java b/src/test/org/apache/nutch/parse/TestSitemapParser.java index 5c9ea83..7521775 100644 --- a/src/test/org/apache/nutch/parse/TestSitemapParser.java +++ b/src/test/org/apache/nutch/parse/TestSitemapParser.java @@ -17,31 +17,17 @@ package org.apache.nutch.parse; -import org.apache.avro.util.Utf8; -import org.apache.commons.io.IOUtils; -import org.apache.nutch.crawl.InjectType; -import org.apache.nutch.metadata.Metadata; -import org.apache.nutch.net.URLFilterException; -import org.apache.nutch.net.URLNormalizers; -import org.apache.nutch.storage.Mark; import org.apache.nutch.storage.ParseStatus; import org.apache.nutch.storage.WebPage; import org.apache.nutch.util.AbstractNutchTest; -import org.apache.nutch.util.TableUtil; import org.junit.After; import org.junit.Before; -import org.junit.Ignore; import org.junit.Test; import org.mockito.Mock; -import org.mockito.stubbing.Answer; -import java.io.IOException; -import java.net.MalformedURLException; -import java.net.URL; import java.nio.ByteBuffer; +import java.nio.charset.StandardCharsets; import java.util.HashMap; -import java.util.Map; -import java.util.Set; import static org.junit.Assert.*; import static org.mockito.Mockito.mock; @@ -77,7 +63,7 @@ public class TestSitemapParser extends AbstractNutchTest { + "<url>\n\t\t<loc>http://localhost/zzz4.html</loc>\n\t\t<lastmod>2015-06-10</lastmod>\n\t\t<changefreq>monthly</changefreq>\n\t\t<priority>0.8</priority>\n\t</url>\n\t" + "<url>\n\t\t<loc>http://localhost/zzz5.html</loc>\n\t\t<lastmod>2015-06-10</lastmod>\n\t\t<changefreq>monthly</changefreq>\n\t\t<priority>0.8</priority>\n\t</url>\n" + "</urlset>"; - when(page.getContent()).thenReturn(ByteBuffer.wrap(content.getBytes())); + when(page.getContent()).thenReturn(ByteBuffer.wrap(content.getBytes(StandardCharsets.UTF_8))); when(page.getContentType()).thenReturn("application/xml"); NutchSitemapParser sParser = new NutchSitemapParser(); @@ -110,7 +96,7 @@ public class TestSitemapParser extends AbstractNutchTest { + " </sitemap>\n" + "</sitemapindex>"; - when(page.getContent()).thenReturn(ByteBuffer.wrap(content.getBytes())); + when(page.getContent()).thenReturn(ByteBuffer.wrap(content.getBytes(StandardCharsets.UTF_8))); when(page.getContentType()).thenReturn("application/xml"); when(page.getSitemaps()).thenReturn(new HashMap<CharSequence, CharSequence>()); http://git-wip-us.apache.org/repos/asf/nutch/blob/a671540a/src/test/org/apache/nutch/plugin/TestPluginSystem.java ---------------------------------------------------------------------- diff --git a/src/test/org/apache/nutch/plugin/TestPluginSystem.java b/src/test/org/apache/nutch/plugin/TestPluginSystem.java index 3ff20ad..96115f5 100644 --- a/src/test/org/apache/nutch/plugin/TestPluginSystem.java +++ b/src/test/org/apache/nutch/plugin/TestPluginSystem.java @@ -20,8 +20,9 @@ package org.apache.nutch.plugin; import java.io.File; import java.io.FileNotFoundException; import java.io.FileOutputStream; -import java.io.FileWriter; import java.io.IOException; +import java.io.OutputStreamWriter; +import java.nio.charset.StandardCharsets; import java.util.LinkedList; import java.util.Locale; import java.util.Properties; @@ -259,7 +260,8 @@ public class TestPluginSystem { */ private void createPluginManifest(int i, String pFolderPath) throws IOException { - FileWriter out = new FileWriter(pFolderPath + File.separator + "plugin.xml"); + OutputStreamWriter osw = new OutputStreamWriter( + new FileOutputStream(pFolderPath + File.separator + "plugin.xml"), StandardCharsets.UTF_8); String xml = "<?xml version=\"1.0\" encoding=\"UTF-8\"?>" + "<!--this is just a simple plugin for testing issues.-->" + "<plugin id=\"org.apache.nutch.plugin." @@ -278,9 +280,9 @@ public class TestPluginSystem { + "id=\"aExtensionId.\" class=\"org.apache.nutch.plugin.HelloWorldExtension\">" + "<parameter name=\"dummy-name\" value=\"a simple param value\"/>" + "</implementation></extension></plugin>"; - out.write(xml); - out.flush(); - out.close(); + osw.write(xml); + osw.flush(); + osw.close(); } private String getParameterValue() { http://git-wip-us.apache.org/repos/asf/nutch/blob/a671540a/src/test/org/apache/nutch/util/TestEncodingDetector.java ---------------------------------------------------------------------- diff --git a/src/test/org/apache/nutch/util/TestEncodingDetector.java b/src/test/org/apache/nutch/util/TestEncodingDetector.java index 776573c..3f768fd 100644 --- a/src/test/org/apache/nutch/util/TestEncodingDetector.java +++ b/src/test/org/apache/nutch/util/TestEncodingDetector.java @@ -24,6 +24,8 @@ import org.junit.Test; import java.io.UnsupportedEncodingException; import java.nio.ByteBuffer; +import java.nio.charset.StandardCharsets; +import java.util.Locale; import static org.junit.Assert.assertEquals; @@ -59,7 +61,7 @@ public class TestEncodingDetector { detector.autoDetectClues(page, true); encoding = detector.guessEncoding(page, "windows-1252"); // no information is available, so it should return default encoding - assertEquals("windows-1252", encoding.toLowerCase()); + assertEquals("windows-1252", encoding.toLowerCase(Locale.ROOT)); page = WebPage.newBuilder().build(); page.setBaseUrl(new Utf8("http://www.example.com/")); @@ -71,7 +73,7 @@ public class TestEncodingDetector { detector = new EncodingDetector(conf); detector.autoDetectClues(page, true); encoding = detector.guessEncoding(page, "windows-1252"); - assertEquals("utf-16", encoding.toLowerCase()); + assertEquals("utf-16", encoding.toLowerCase(Locale.ROOT)); page = WebPage.newBuilder().build(); page.setBaseUrl(new Utf8("http://www.example.com/")); @@ -82,7 +84,7 @@ public class TestEncodingDetector { detector.autoDetectClues(page, true); detector.addClue("windows-1254", "sniffed"); encoding = detector.guessEncoding(page, "windows-1252"); - assertEquals("windows-1254", encoding.toLowerCase()); + assertEquals("windows-1254", encoding.toLowerCase(Locale.ROOT)); // enable autodetection conf.setInt(EncodingDetector.MIN_CONFIDENCE_KEY, 50); @@ -91,13 +93,13 @@ public class TestEncodingDetector { page.setContentType(new Utf8("text/plain")); page.setContent(ByteBuffer.wrap(contentInOctets)); page.getMetadata().put(new Utf8(Response.CONTENT_TYPE), - ByteBuffer.wrap("text/plain; charset=UTF-16".getBytes())); + ByteBuffer.wrap("text/plain; charset=UTF-16".getBytes(StandardCharsets.UTF_8))); detector = new EncodingDetector(conf); detector.autoDetectClues(page, true); detector.addClue("utf-32", "sniffed"); encoding = detector.guessEncoding(page, "windows-1252"); - assertEquals("utf-8", encoding.toLowerCase()); + assertEquals("utf-8", encoding.toLowerCase(Locale.ROOT)); } } http://git-wip-us.apache.org/repos/asf/nutch/blob/a671540a/src/test/org/apache/nutch/util/TestGZIPUtils.java ---------------------------------------------------------------------- diff --git a/src/test/org/apache/nutch/util/TestGZIPUtils.java b/src/test/org/apache/nutch/util/TestGZIPUtils.java index 50fda7e..878442e 100644 --- a/src/test/org/apache/nutch/util/TestGZIPUtils.java +++ b/src/test/org/apache/nutch/util/TestGZIPUtils.java @@ -21,6 +21,7 @@ import org.junit.Test; import static org.junit.Assert.*; import java.io.IOException; +import java.nio.charset.StandardCharsets; /** Unit tests for GZIPUtils methods. */ public class TestGZIPUtils { @@ -115,41 +116,41 @@ public class TestGZIPUtils { @Test public void testZipUnzip() { - byte[] testBytes = SHORT_TEST_STRING.getBytes(); + byte[] testBytes = SHORT_TEST_STRING.getBytes(StandardCharsets.UTF_8); testZipUnzip(testBytes); - testBytes = LONGER_TEST_STRING.getBytes(); + testBytes = LONGER_TEST_STRING.getBytes(StandardCharsets.UTF_8); testZipUnzip(testBytes); - testBytes = WEBPAGE.getBytes(); + testBytes = WEBPAGE.getBytes(StandardCharsets.UTF_8); testZipUnzip(testBytes); } @Test public void testZipUnzipBestEffort() { - byte[] testBytes = SHORT_TEST_STRING.getBytes(); + byte[] testBytes = SHORT_TEST_STRING.getBytes(StandardCharsets.UTF_8); testZipUnzipBestEffort(testBytes); - testBytes = LONGER_TEST_STRING.getBytes(); + testBytes = LONGER_TEST_STRING.getBytes(StandardCharsets.UTF_8); testZipUnzipBestEffort(testBytes); - testBytes = WEBPAGE.getBytes(); + testBytes = WEBPAGE.getBytes(StandardCharsets.UTF_8); testZipUnzipBestEffort(testBytes); } @Test public void testTruncation() { - byte[] testBytes = SHORT_TEST_STRING.getBytes(); + byte[] testBytes = SHORT_TEST_STRING.getBytes(StandardCharsets.UTF_8); testTruncation(testBytes); - testBytes = LONGER_TEST_STRING.getBytes(); + testBytes = LONGER_TEST_STRING.getBytes(StandardCharsets.UTF_8); testTruncation(testBytes); - testBytes = WEBPAGE.getBytes(); + testBytes = WEBPAGE.getBytes(StandardCharsets.UTF_8); testTruncation(testBytes); } @Test public void testLimit() { - byte[] testBytes = SHORT_TEST_STRING.getBytes(); + byte[] testBytes = SHORT_TEST_STRING.getBytes(StandardCharsets.UTF_8); testLimit(testBytes); - testBytes = LONGER_TEST_STRING.getBytes(); + testBytes = LONGER_TEST_STRING.getBytes(StandardCharsets.UTF_8); testLimit(testBytes); - testBytes = WEBPAGE.getBytes(); + testBytes = WEBPAGE.getBytes(StandardCharsets.UTF_8); testLimit(testBytes); } http://git-wip-us.apache.org/repos/asf/nutch/blob/a671540a/src/test/org/apache/nutch/util/TestNodeWalker.java ---------------------------------------------------------------------- diff --git a/src/test/org/apache/nutch/util/TestNodeWalker.java b/src/test/org/apache/nutch/util/TestNodeWalker.java index db6db91..ec3669e 100644 --- a/src/test/org/apache/nutch/util/TestNodeWalker.java +++ b/src/test/org/apache/nutch/util/TestNodeWalker.java @@ -18,6 +18,7 @@ package org.apache.nutch.util; import java.io.ByteArrayInputStream; +import java.nio.charset.StandardCharsets; import org.junit.Before; import org.junit.Test; @@ -60,7 +61,7 @@ public class TestNodeWalker { "http://apache.org/xml/features/nonvalidating/load-external-dtd", false); parser - .parse(new InputSource(new ByteArrayInputStream(WEBPAGE.getBytes()))); + .parse(new InputSource(new ByteArrayInputStream(WEBPAGE.getBytes(StandardCharsets.UTF_8)))); } catch (Exception e) { e.printStackTrace(); }