This is an automated email from the ASF dual-hosted git repository.

jnioche pushed a commit to branch main
in repository https://gitbox.apache.org/repos/asf/incubator-stormcrawler.git


The following commit(s) were added to refs/heads/main by this push:
     new c1088fb3 #1207 -- add forbidden-apis (#1208)
c1088fb3 is described below

commit c1088fb3ff3ca9ca99bcce108d8bb2b40b97c094
Author: Tim Allison <[email protected]>
AuthorDate: Fri May 3 11:54:07 2024 -0400

    #1207 -- add forbidden-apis (#1208)
    
    * Add forbidden-apis -- this just adds the plugin. I'll update the repo to 
pass it in follow-on commits. This is just a WIP.
    
    * Add forbidden-apis -- stormcrawler-core
    
    * Add forbidden-apis -- rest of the project
---
 .../apache/stormcrawler/bolt/FeedParserBolt.java   |  8 ++++--
 .../apache/stormcrawler/bolt/JSoupParserBolt.java  |  3 ++-
 .../stormcrawler/bolt/SiteMapParserBolt.java       | 10 ++++---
 .../filtering/regex/FastURLFilter.java             |  5 ++--
 .../filtering/regex/RegexURLNormalizer.java        |  6 +++--
 .../apache/stormcrawler/parse/JSoupFilters.java    |  3 ++-
 .../apache/stormcrawler/parse/ParseFilters.java    |  3 ++-
 .../apache/stormcrawler/parse/TextExtractor.java   |  3 ++-
 .../parse/filter/MimeTypeNormalization.java        | 17 ++++++------
 .../persistence/AdaptiveScheduler.java             |  3 ++-
 .../stormcrawler/persistence/DefaultScheduler.java |  2 +-
 .../stormcrawler/protocol/RobotRulesParser.java    |  9 ++++---
 .../stormcrawler/protocol/file/FileResponse.java   |  3 ++-
 .../protocol/httpclient/HttpProtocol.java          |  5 +++-
 .../stormcrawler/protocol/okhttp/HttpProtocol.java | 31 +++++++++++++++++-----
 .../protocol/selenium/SeleniumProtocol.java        |  3 ++-
 .../stormcrawler/proxy/MultiProxyManager.java      |  8 +++---
 .../stormcrawler/proxy/SingleProxyManager.java     |  7 +++--
 .../org/apache/stormcrawler/spout/FileSpout.java   |  2 +-
 .../apache/stormcrawler/util/CookieConverter.java  |  6 ++---
 .../apache/stormcrawler/util/PerSecondReducer.java |  2 +-
 .../stormcrawler/bolt/JSoupParserBoltTest.java     |  3 ++-
 .../filtering/BasicURLNormalizerTest.java          |  5 ++--
 .../apache/stormcrawler/indexer/IndexerTester.java |  4 ++-
 .../persistence/AdaptiveSchedulerTest.java         | 18 ++++++++++---
 .../persistence/DefaultSchedulerTest.java          |  8 +++---
 .../stormcrawler/proxy/MultiProxyManagerTest.java  |  3 ++-
 .../aws/bolt/CloudSearchIndexerBolt.java           |  3 ++-
 .../stormcrawler/aws/bolt/CloudSearchUtils.java    |  3 ++-
 .../opensearch/metrics/MetricsConsumer.java        |  3 ++-
 .../opensearch/persistence/AggregationSpout.java   |  9 ++++---
 .../opensearch/persistence/StatusUpdaterBolt.java  | 16 ++++++++---
 .../stormcrawler/solr/metrics/MetricsConsumer.java |  3 ++-
 .../solr/persistence/StatusUpdaterBolt.java        |  3 ++-
 .../java/org/apache/stormcrawler/sql/SQLSpout.java |  4 ++-
 .../stormcrawler/warc/WARCFileNameFormat.java      |  7 ++---
 .../apache/stormcrawler/warc/WARCRecordFormat.java |  2 +-
 .../stormcrawler/warc/WARCRequestRecordFormat.java |  2 +-
 pom.xml                                            | 31 ++++++++++++++++++++--
 39 files changed, 187 insertions(+), 79 deletions(-)

diff --git 
a/core/src/main/java/org/apache/stormcrawler/bolt/FeedParserBolt.java 
b/core/src/main/java/org/apache/stormcrawler/bolt/FeedParserBolt.java
index 112ac816..d1dde17c 100644
--- a/core/src/main/java/org/apache/stormcrawler/bolt/FeedParserBolt.java
+++ b/core/src/main/java/org/apache/stormcrawler/bolt/FeedParserBolt.java
@@ -25,12 +25,15 @@ import com.rometools.rome.feed.synd.SyndFeed;
 import com.rometools.rome.io.SyndFeedInput;
 import java.io.ByteArrayInputStream;
 import java.net.URL;
+import java.nio.charset.StandardCharsets;
 import java.util.ArrayList;
 import java.util.Arrays;
 import java.util.Calendar;
 import java.util.Date;
 import java.util.List;
+import java.util.Locale;
 import java.util.Map;
+import java.util.TimeZone;
 import org.apache.commons.lang.StringUtils;
 import org.apache.storm.task.OutputCollector;
 import org.apache.storm.task.TopologyContext;
@@ -85,7 +88,7 @@ public class FeedParserBolt extends StatusEmitterBolt {
                     isfeed = true;
                 } else {
                     // try based on the first bytes?
-                    byte[] clue = "<rss ".getBytes();
+                    byte[] clue = "<rss ".getBytes(StandardCharsets.UTF_8);
                     byte[] beginning = content;
                     final int maxOffsetGuess = 100;
                     if (content.length > maxOffsetGuess) {
@@ -195,7 +198,8 @@ public class FeedParserBolt extends StatusEmitterBolt {
             if (publishedDate != null) {
                 // filter based on the published date
                 if (filterHoursSincePub != -1) {
-                    Calendar rightNow = Calendar.getInstance();
+                    Calendar rightNow =
+                            Calendar.getInstance(TimeZone.getTimeZone("GMT"), 
Locale.ROOT);
                     rightNow.add(Calendar.HOUR, -filterHoursSincePub);
                     if (publishedDate.before(rightNow.getTime())) {
                         LOG.info(
diff --git 
a/core/src/main/java/org/apache/stormcrawler/bolt/JSoupParserBolt.java 
b/core/src/main/java/org/apache/stormcrawler/bolt/JSoupParserBolt.java
index 67aa673e..44e994ef 100644
--- a/core/src/main/java/org/apache/stormcrawler/bolt/JSoupParserBolt.java
+++ b/core/src/main/java/org/apache/stormcrawler/bolt/JSoupParserBolt.java
@@ -28,6 +28,7 @@ import java.nio.charset.Charset;
 import java.util.HashMap;
 import java.util.LinkedList;
 import java.util.List;
+import java.util.Locale;
 import java.util.Map;
 import org.apache.commons.lang.StringUtils;
 import org.apache.storm.metric.api.MultiCountMetric;
@@ -190,7 +191,7 @@ public class JSoupParserBolt extends StatusEmitterBolt {
         }
 
         if (StringUtils.isNotBlank(mimeType)) {
-            if (mimeType.toLowerCase().contains("html")) {
+            if (mimeType.toLowerCase(Locale.ROOT).contains("html")) {
                 CT_OK = true;
             }
         }
diff --git 
a/core/src/main/java/org/apache/stormcrawler/bolt/SiteMapParserBolt.java 
b/core/src/main/java/org/apache/stormcrawler/bolt/SiteMapParserBolt.java
index 22fbb20d..7e2c88f0 100644
--- a/core/src/main/java/org/apache/stormcrawler/bolt/SiteMapParserBolt.java
+++ b/core/src/main/java/org/apache/stormcrawler/bolt/SiteMapParserBolt.java
@@ -31,6 +31,7 @@ import crawlercommons.sitemaps.extension.Extension;
 import crawlercommons.sitemaps.extension.ExtensionMetadata;
 import java.io.IOException;
 import java.net.URL;
+import java.nio.charset.StandardCharsets;
 import java.util.ArrayList;
 import java.util.Arrays;
 import java.util.Calendar;
@@ -38,7 +39,9 @@ import java.util.Collection;
 import java.util.Date;
 import java.util.Iterator;
 import java.util.List;
+import java.util.Locale;
 import java.util.Map;
+import java.util.TimeZone;
 import org.apache.commons.lang.StringUtils;
 import org.apache.storm.metric.api.MeanReducer;
 import org.apache.storm.metric.api.ReducedMetric;
@@ -73,7 +76,7 @@ public class SiteMapParserBolt extends StatusEmitterBolt {
 
     private static final org.slf4j.Logger LOG = 
LoggerFactory.getLogger(SiteMapParserBolt.class);
 
-    private static final byte[] clue = Namespace.SITEMAP.getBytes();
+    private static final byte[] clue = 
Namespace.SITEMAP.getBytes(StandardCharsets.UTF_8);
 
     private SiteMapParser parser;
 
@@ -198,7 +201,7 @@ public class SiteMapParserBolt extends StatusEmitterBolt {
             SiteMapIndex smi = (SiteMapIndex) siteMap;
             Collection<AbstractSiteMap> subsitemaps = smi.getSitemaps();
 
-            Calendar rightNow = Calendar.getInstance();
+            Calendar rightNow = 
Calendar.getInstance(TimeZone.getTimeZone("GMT"), Locale.ROOT);
             rightNow.add(Calendar.HOUR, -filterHoursSinceModified);
 
             int delay = 0;
@@ -274,7 +277,8 @@ public class SiteMapParserBolt extends StatusEmitterBolt {
                 if (lastModified != null) {
                     // filter based on the published date
                     if (filterHoursSinceModified != -1) {
-                        Calendar rightNow = Calendar.getInstance();
+                        Calendar rightNow =
+                                
Calendar.getInstance(TimeZone.getTimeZone("GMT"), Locale.ROOT);
                         rightNow.add(Calendar.HOUR, -filterHoursSinceModified);
                         if (lastModified.before(rightNow.getTime())) {
                             LOG.info(
diff --git 
a/core/src/main/java/org/apache/stormcrawler/filtering/regex/FastURLFilter.java 
b/core/src/main/java/org/apache/stormcrawler/filtering/regex/FastURLFilter.java
index 43077b8b..671b9c0f 100644
--- 
a/core/src/main/java/org/apache/stormcrawler/filtering/regex/FastURLFilter.java
+++ 
b/core/src/main/java/org/apache/stormcrawler/filtering/regex/FastURLFilter.java
@@ -29,6 +29,7 @@ import java.util.HashMap;
 import java.util.Iterator;
 import java.util.LinkedList;
 import java.util.List;
+import java.util.Locale;
 import java.util.Map;
 import java.util.regex.Pattern;
 import org.apache.stormcrawler.JSONResource;
@@ -327,10 +328,10 @@ class Rule {
 
     public Rule(String line) {
         int offset = 0;
-        String lcline = line.toLowerCase();
+        String lcline = line.toLowerCase(Locale.ROOT);
         // separate the type from the pattern
         for (Type t : Type.values()) {
-            String start = t.toString().toLowerCase() + " ";
+            String start = t.toString().toLowerCase(Locale.ROOT) + " ";
             if (lcline.startsWith(start)) {
                 type = t;
                 offset = start.length();
diff --git 
a/core/src/main/java/org/apache/stormcrawler/filtering/regex/RegexURLNormalizer.java
 
b/core/src/main/java/org/apache/stormcrawler/filtering/regex/RegexURLNormalizer.java
index 31aa6ac3..4679f8e1 100644
--- 
a/core/src/main/java/org/apache/stormcrawler/filtering/regex/RegexURLNormalizer.java
+++ 
b/core/src/main/java/org/apache/stormcrawler/filtering/regex/RegexURLNormalizer.java
@@ -20,6 +20,7 @@ import com.fasterxml.jackson.databind.JsonNode;
 import com.fasterxml.jackson.databind.node.ArrayNode;
 import java.io.FileNotFoundException;
 import java.io.FileReader;
+import java.io.IOException;
 import java.io.InputStream;
 import java.io.InputStreamReader;
 import java.io.Reader;
@@ -236,9 +237,10 @@ public class RegexURLNormalizer extends URLFilter {
      * Utility method to test rules against an input. the first arg is the 
absolute path of the
      * rules file, the second is the URL to be normalised
      */
-    public static void main(String[] args) throws FileNotFoundException {
+    public static void main(String[] args) throws FileNotFoundException, 
IOException {
         RegexURLNormalizer normalizer = new RegexURLNormalizer();
-        normalizer.rules = normalizer.readConfiguration(new 
FileReader(args[0]));
+        normalizer.rules =
+                normalizer.readConfiguration(new FileReader(args[0], 
StandardCharsets.UTF_8));
 
         String output = normalizer.filter(null, null, args[1]);
 
diff --git a/core/src/main/java/org/apache/stormcrawler/parse/JSoupFilters.java 
b/core/src/main/java/org/apache/stormcrawler/parse/JSoupFilters.java
index 53ca12f2..91474006 100644
--- a/core/src/main/java/org/apache/stormcrawler/parse/JSoupFilters.java
+++ b/core/src/main/java/org/apache/stormcrawler/parse/JSoupFilters.java
@@ -21,6 +21,7 @@ import com.fasterxml.jackson.databind.ObjectMapper;
 import java.io.IOException;
 import java.io.InputStream;
 import java.net.URL;
+import java.nio.charset.StandardCharsets;
 import java.util.List;
 import java.util.Map;
 import org.apache.commons.cli.CommandLine;
@@ -154,7 +155,7 @@ public class JSoupFilters extends AbstractConfigurable 
implements JSoupFilter, J
 
         byte[] content = IOUtils.toByteArray((new URL(url)).openStream());
 
-        Document doc = Jsoup.parse(new String(content), url);
+        Document doc = Jsoup.parse(new String(content, 
StandardCharsets.UTF_8), url);
 
         filters.filter(url, content, doc, parse);
 
diff --git a/core/src/main/java/org/apache/stormcrawler/parse/ParseFilters.java 
b/core/src/main/java/org/apache/stormcrawler/parse/ParseFilters.java
index 83c48afe..e2a0723e 100644
--- a/core/src/main/java/org/apache/stormcrawler/parse/ParseFilters.java
+++ b/core/src/main/java/org/apache/stormcrawler/parse/ParseFilters.java
@@ -23,6 +23,7 @@ import com.fasterxml.jackson.databind.ObjectMapper;
 import java.io.IOException;
 import java.io.InputStream;
 import java.net.URL;
+import java.nio.charset.StandardCharsets;
 import java.util.List;
 import java.util.Map;
 import org.apache.commons.cli.CommandLine;
@@ -180,7 +181,7 @@ public class ParseFilters extends ParseFilter implements 
JSONResource {
 
         byte[] content = IOUtils.toByteArray((new URL(url)).openStream());
 
-        Document doc = Jsoup.parse(new String(content), url);
+        Document doc = Jsoup.parse(new String(content, 
StandardCharsets.UTF_8), url);
 
         filters.filter(url, content, DocumentFragmentBuilder.fromJsoup(doc), 
parse);
 
diff --git 
a/core/src/main/java/org/apache/stormcrawler/parse/TextExtractor.java 
b/core/src/main/java/org/apache/stormcrawler/parse/TextExtractor.java
index 7f79a94d..99fdd16c 100644
--- a/core/src/main/java/org/apache/stormcrawler/parse/TextExtractor.java
+++ b/core/src/main/java/org/apache/stormcrawler/parse/TextExtractor.java
@@ -18,6 +18,7 @@ package org.apache.stormcrawler.parse;
 
 import java.util.HashSet;
 import java.util.List;
+import java.util.Locale;
 import java.util.Map;
 import org.apache.stormcrawler.util.ConfUtils;
 import org.jetbrains.annotations.Contract;
@@ -74,7 +75,7 @@ public class TextExtractor {
         inclusionPatterns = ConfUtils.loadListFromConf(INCLUDE_PARAM_NAME, 
stormConf);
         excludedTags = new HashSet<String>();
         ConfUtils.loadListFromConf(EXCLUDE_PARAM_NAME, stormConf)
-                .forEach((s) -> excludedTags.add(s.toLowerCase()));
+                .forEach((s) -> excludedTags.add(s.toLowerCase(Locale.ROOT)));
     }
 
     public String text(Element element) {
diff --git 
a/core/src/main/java/org/apache/stormcrawler/parse/filter/MimeTypeNormalization.java
 
b/core/src/main/java/org/apache/stormcrawler/parse/filter/MimeTypeNormalization.java
index 6784398e..3f83da2f 100644
--- 
a/core/src/main/java/org/apache/stormcrawler/parse/filter/MimeTypeNormalization.java
+++ 
b/core/src/main/java/org/apache/stormcrawler/parse/filter/MimeTypeNormalization.java
@@ -16,6 +16,7 @@
  */
 package org.apache.stormcrawler.parse.filter;
 
+import java.util.Locale;
 import org.apache.storm.shade.org.apache.commons.lang.StringUtils;
 import org.apache.stormcrawler.Metadata;
 import org.apache.stormcrawler.parse.ParseFilter;
@@ -36,21 +37,21 @@ public class MimeTypeNormalization extends ParseFilter {
         String ct = m.getFirstValue("parse.Content-Type");
         if (StringUtils.isBlank(ct)) {
             ct = "unknown";
-        } else if (ct.toLowerCase().contains("html")) {
+        } else if (ct.toLowerCase(Locale.ROOT).contains("html")) {
             ct = "html";
-        } else if (ct.toLowerCase().contains("pdf")) {
+        } else if (ct.toLowerCase(Locale.ROOT).contains("pdf")) {
             ct = "pdf";
-        } else if (ct.toLowerCase().contains("word")) {
+        } else if (ct.toLowerCase(Locale.ROOT).contains("word")) {
             ct = "word";
-        } else if (ct.toLowerCase().contains("excel")) {
+        } else if (ct.toLowerCase(Locale.ROOT).contains("excel")) {
             ct = "excel";
-        } else if (ct.toLowerCase().contains("powerpoint")) {
+        } else if (ct.toLowerCase(Locale.ROOT).contains("powerpoint")) {
             ct = "powerpoint";
-        } else if (ct.toLowerCase().startsWith("video/")) {
+        } else if (ct.toLowerCase(Locale.ROOT).startsWith("video/")) {
             ct = "video";
-        } else if (ct.toLowerCase().startsWith("image/")) {
+        } else if (ct.toLowerCase(Locale.ROOT).startsWith("image/")) {
             ct = "image";
-        } else if (ct.toLowerCase().startsWith("audio/")) {
+        } else if (ct.toLowerCase(Locale.ROOT).startsWith("audio/")) {
             ct = "audio";
         } else {
             ct = "other";
diff --git 
a/core/src/main/java/org/apache/stormcrawler/persistence/AdaptiveScheduler.java 
b/core/src/main/java/org/apache/stormcrawler/persistence/AdaptiveScheduler.java
index 61c2330d..32727b04 100644
--- 
a/core/src/main/java/org/apache/stormcrawler/persistence/AdaptiveScheduler.java
+++ 
b/core/src/main/java/org/apache/stormcrawler/persistence/AdaptiveScheduler.java
@@ -22,6 +22,7 @@ import java.util.Date;
 import java.util.Locale;
 import java.util.Map;
 import java.util.Optional;
+import java.util.TimeZone;
 import org.apache.stormcrawler.Constants;
 import org.apache.stormcrawler.Metadata;
 import org.apache.stormcrawler.parse.filter.MD5SignatureParseFilter;
@@ -224,7 +225,7 @@ public class AdaptiveScheduler extends DefaultScheduler {
             return super.schedule(status, metadata);
         }
 
-        Calendar now = Calendar.getInstance(Locale.ROOT);
+        Calendar now = Calendar.getInstance(TimeZone.getTimeZone("GMT"), 
Locale.ROOT);
 
         String signatureModified = 
metadata.getFirstValue(SIGNATURE_MODIFIED_KEY);
 
diff --git 
a/core/src/main/java/org/apache/stormcrawler/persistence/DefaultScheduler.java 
b/core/src/main/java/org/apache/stormcrawler/persistence/DefaultScheduler.java
index f2e35bff..925ea716 100644
--- 
a/core/src/main/java/org/apache/stormcrawler/persistence/DefaultScheduler.java
+++ 
b/core/src/main/java/org/apache/stormcrawler/persistence/DefaultScheduler.java
@@ -140,7 +140,7 @@ public class DefaultScheduler extends Scheduler {
             return Optional.empty();
         }
 
-        Calendar cal = Calendar.getInstance();
+        Calendar cal = Calendar.getInstance(TimeZone.getTimeZone("GMT"), 
Locale.ROOT);
         cal.add(Calendar.MINUTE, minutesIncrement);
 
         return Optional.of(cal.getTime());
diff --git 
a/core/src/main/java/org/apache/stormcrawler/protocol/RobotRulesParser.java 
b/core/src/main/java/org/apache/stormcrawler/protocol/RobotRulesParser.java
index 02692055..ee44f8aa 100644
--- a/core/src/main/java/org/apache/stormcrawler/protocol/RobotRulesParser.java
+++ b/core/src/main/java/org/apache/stormcrawler/protocol/RobotRulesParser.java
@@ -27,6 +27,7 @@ import java.util.ArrayList;
 import java.util.Collection;
 import java.util.LinkedHashSet;
 import java.util.List;
+import java.util.Locale;
 import java.util.StringTokenizer;
 import java.util.regex.Pattern;
 import javax.security.auth.login.Configuration;
@@ -104,7 +105,7 @@ public abstract class RobotRulesParser {
             throw new RuntimeException("Agent name not configured!");
         }
 
-        agentName = agentName.toLowerCase();
+        agentName = agentName.toLowerCase(Locale.ROOT);
         checkAgentValue(agentName);
 
         ArrayList<String> agents = new ArrayList<>();
@@ -115,13 +116,13 @@ public abstract class RobotRulesParser {
         if (configuredAgentNames.size() == 1) {
             StringTokenizer tok = new 
StringTokenizer(configuredAgentNames.get(0), ",");
             while (tok.hasMoreTokens()) {
-                String agent = tok.nextToken().trim().toLowerCase();
+                String agent = tok.nextToken().trim().toLowerCase(Locale.ROOT);
                 checkAgentValue(agent);
                 agents.add(agent);
             }
         } else {
             for (String ag : configuredAgentNames) {
-                String agent = ag.trim().toLowerCase();
+                String agent = ag.trim().toLowerCase(Locale.ROOT);
                 checkAgentValue(agent);
                 agents.add(agent);
             }
@@ -136,7 +137,7 @@ public abstract class RobotRulesParser {
             LOG.info(
                     "No agents listed in 'http.robots.agents' property! Using 
http.agent.name [{}]",
                     agentName);
-            this.agentNames.add(agentName.toLowerCase());
+            this.agentNames.add(agentName.toLowerCase(Locale.ROOT));
         } else {
             int index = 0;
             if ((agents.get(0)).equalsIgnoreCase(agentName)) {
diff --git 
a/core/src/main/java/org/apache/stormcrawler/protocol/file/FileResponse.java 
b/core/src/main/java/org/apache/stormcrawler/protocol/file/FileResponse.java
index 24bdfce8..16d6f93f 100644
--- a/core/src/main/java/org/apache/stormcrawler/protocol/file/FileResponse.java
+++ b/core/src/main/java/org/apache/stormcrawler/protocol/file/FileResponse.java
@@ -22,6 +22,7 @@ import java.io.IOException;
 import java.lang.invoke.MethodHandles;
 import java.net.URL;
 import java.net.URLDecoder;
+import java.nio.charset.StandardCharsets;
 import java.text.SimpleDateFormat;
 import java.util.Date;
 import java.util.Locale;
@@ -139,6 +140,6 @@ public class FileResponse {
             }
         }
         sb.append("</urlset>");
-        return new String(sb).getBytes();
+        return new String(sb).getBytes(StandardCharsets.UTF_8);
     }
 }
diff --git 
a/core/src/main/java/org/apache/stormcrawler/protocol/httpclient/HttpProtocol.java
 
b/core/src/main/java/org/apache/stormcrawler/protocol/httpclient/HttpProtocol.java
index 2fc36f62..6478bb3c 100644
--- 
a/core/src/main/java/org/apache/stormcrawler/protocol/httpclient/HttpProtocol.java
+++ 
b/core/src/main/java/org/apache/stormcrawler/protocol/httpclient/HttpProtocol.java
@@ -19,6 +19,7 @@ package org.apache.stormcrawler.protocol.httpclient;
 import java.io.IOException;
 import java.io.InputStream;
 import java.net.MalformedURLException;
+import java.nio.charset.StandardCharsets;
 import java.util.ArrayList;
 import java.util.Base64;
 import java.util.Collection;
@@ -120,7 +121,9 @@ public class HttpProtocol extends AbstractHttpProtocol
             String basicAuthPass = ConfUtils.getString(conf, 
"http.basicauth.password", "");
             String encoding =
                     Base64.getEncoder()
-                            .encodeToString((basicAuthUser + ":" + 
basicAuthPass).getBytes());
+                            .encodeToString(
+                                    (basicAuthUser + ":" + basicAuthPass)
+                                            .getBytes(StandardCharsets.UTF_8));
             defaultHeaders.add(new BasicHeader("Authorization", "Basic " + 
encoding));
         }
 
diff --git 
a/core/src/main/java/org/apache/stormcrawler/protocol/okhttp/HttpProtocol.java 
b/core/src/main/java/org/apache/stormcrawler/protocol/okhttp/HttpProtocol.java
index 3a8cfffa..01902713 100644
--- 
a/core/src/main/java/org/apache/stormcrawler/protocol/okhttp/HttpProtocol.java
+++ 
b/core/src/main/java/org/apache/stormcrawler/protocol/okhttp/HttpProtocol.java
@@ -21,6 +21,7 @@ import java.net.InetSocketAddress;
 import java.net.MalformedURLException;
 import java.net.Proxy;
 import java.net.URL;
+import java.nio.charset.StandardCharsets;
 import java.security.cert.CertificateException;
 import java.util.*;
 import java.util.concurrent.TimeUnit;
@@ -200,7 +201,9 @@ public class HttpProtocol extends AbstractHttpProtocol {
             final String basicAuthPass = ConfUtils.getString(conf, 
"http.basicauth.password", "");
             final String encoding =
                     Base64.getEncoder()
-                            .encodeToString((basicAuthUser + ":" + 
basicAuthPass).getBytes());
+                            .encodeToString(
+                                    (basicAuthUser + ":" + basicAuthPass)
+                                            .getBytes(StandardCharsets.UTF_8));
             customRequestHeaders.add(new KeyValue("Authorization", "Basic " + 
encoding));
         }
 
@@ -289,7 +292,7 @@ public class HttpProtocol extends AbstractHttpProtocol {
                 // format SCProxy into native Java proxy
                 Proxy proxy =
                         new Proxy(
-                                
Proxy.Type.valueOf(prox.getProtocol().toUpperCase()),
+                                
Proxy.Type.valueOf(prox.getProtocol().toUpperCase(Locale.ROOT)),
                                 new InetSocketAddress(
                                         prox.getAddress(), 
Integer.parseInt(prox.getPort())));
 
@@ -395,7 +398,9 @@ public class HttpProtocol extends AbstractHttpProtocol {
 
                 if (key.equals(ProtocolResponse.REQUEST_HEADERS_KEY)
                         || key.equals(ProtocolResponse.RESPONSE_HEADERS_KEY)) {
-                    value = new String(Base64.getDecoder().decode(value));
+                    value =
+                            new String(
+                                    Base64.getDecoder().decode(value), 
StandardCharsets.ISO_8859_1);
                 }
 
                 responsemetadata.addValue(key.toLowerCase(Locale.ROOT), value);
@@ -569,10 +574,18 @@ public class HttpProtocol extends AbstractHttpProtocol {
             responseverbatim.append("\r\n");
 
             final byte[] encodedBytesResponse =
-                    
Base64.getEncoder().encode(responseverbatim.toString().getBytes());
+                    Base64.getEncoder()
+                            .encode(
+                                    responseverbatim
+                                            .toString()
+                                            
.getBytes(StandardCharsets.ISO_8859_1));
 
             final byte[] encodedBytesRequest =
-                    
Base64.getEncoder().encode(requestverbatim.toString().getBytes());
+                    Base64.getEncoder()
+                            .encode(
+                                    requestverbatim
+                                            .toString()
+                                            
.getBytes(StandardCharsets.ISO_8859_1));
 
             final StringBuilder protocols = new 
StringBuilder(response.protocol().toString());
             final Handshake handshake = connection.handshake();
@@ -583,8 +596,12 @@ public class HttpProtocol extends AbstractHttpProtocol {
 
             // returns a modified version of the response
             return response.newBuilder()
-                    .header(ProtocolResponse.REQUEST_HEADERS_KEY, new 
String(encodedBytesRequest))
-                    .header(ProtocolResponse.RESPONSE_HEADERS_KEY, new 
String(encodedBytesResponse))
+                    .header(
+                            ProtocolResponse.REQUEST_HEADERS_KEY,
+                            new String(encodedBytesRequest, 
StandardCharsets.ISO_8859_1))
+                    .header(
+                            ProtocolResponse.RESPONSE_HEADERS_KEY,
+                            new String(encodedBytesResponse, 
StandardCharsets.ISO_8859_1))
                     .header(ProtocolResponse.RESPONSE_IP_KEY, ipAddress)
                     .header(ProtocolResponse.REQUEST_TIME_KEY, 
Long.toString(startFetchTime))
                     .header(ProtocolResponse.PROTOCOL_VERSIONS_KEY, 
protocols.toString())
diff --git 
a/core/src/main/java/org/apache/stormcrawler/protocol/selenium/SeleniumProtocol.java
 
b/core/src/main/java/org/apache/stormcrawler/protocol/selenium/SeleniumProtocol.java
index 21f2d175..23a318b8 100644
--- 
a/core/src/main/java/org/apache/stormcrawler/protocol/selenium/SeleniumProtocol.java
+++ 
b/core/src/main/java/org/apache/stormcrawler/protocol/selenium/SeleniumProtocol.java
@@ -16,6 +16,7 @@
  */
 package org.apache.stormcrawler.protocol.selenium;
 
+import java.nio.charset.StandardCharsets;
 import java.time.Instant;
 import java.util.concurrent.LinkedBlockingQueue;
 import org.apache.storm.Config;
@@ -77,7 +78,7 @@ public abstract class SeleniumProtocol extends 
AbstractHttpProtocol {
             outputMeta.addValue(MD_KEY_END, Instant.now().toString());
 
             // if no filters got triggered
-            byte[] content = driver.getPageSource().getBytes();
+            byte[] content = 
driver.getPageSource().getBytes(StandardCharsets.UTF_8);
             return new ProtocolResponse(content, 200, outputMeta);
 
         } finally {
diff --git 
a/core/src/main/java/org/apache/stormcrawler/proxy/MultiProxyManager.java 
b/core/src/main/java/org/apache/stormcrawler/proxy/MultiProxyManager.java
index fe5e53af..016f06bc 100644
--- a/core/src/main/java/org/apache/stormcrawler/proxy/MultiProxyManager.java
+++ b/core/src/main/java/org/apache/stormcrawler/proxy/MultiProxyManager.java
@@ -17,9 +17,9 @@
 package org.apache.stormcrawler.proxy;
 
 import java.io.File;
-import java.io.FileNotFoundException;
 import java.io.IOException;
 import java.net.URL;
+import java.nio.charset.StandardCharsets;
 import java.util.Random;
 import java.util.Scanner;
 import java.util.concurrent.atomic.AtomicInteger;
@@ -95,7 +95,7 @@ public class MultiProxyManager implements ProxyManager {
         // conditionally load file from resources
         if (resourcesProxyFilePath != null) {
             try {
-                scanner = new Scanner(resourcesProxyFilePath.openStream());
+                scanner = new Scanner(resourcesProxyFilePath.openStream(), 
StandardCharsets.UTF_8);
             } catch (IOException e) {
                 throw new RuntimeException("failed to load proxy resource 
file: " + proxyFile, e);
             }
@@ -105,8 +105,8 @@ public class MultiProxyManager implements ProxyManager {
 
             // create new scanner to read file line-by-line
             try {
-                scanner = new Scanner(proxyFileObj);
-            } catch (FileNotFoundException e) {
+                scanner = new Scanner(proxyFileObj, StandardCharsets.UTF_8);
+            } catch (IOException e) {
                 throw new RuntimeException("failed to load proxy file: " + 
proxyFile, e);
             }
         }
diff --git 
a/core/src/main/java/org/apache/stormcrawler/proxy/SingleProxyManager.java 
b/core/src/main/java/org/apache/stormcrawler/proxy/SingleProxyManager.java
index c986f304..ff6c8b63 100644
--- a/core/src/main/java/org/apache/stormcrawler/proxy/SingleProxyManager.java
+++ b/core/src/main/java/org/apache/stormcrawler/proxy/SingleProxyManager.java
@@ -16,6 +16,7 @@
  */
 package org.apache.stormcrawler.proxy;
 
+import java.util.Locale;
 import org.apache.storm.Config;
 import org.apache.stormcrawler.Metadata;
 import org.apache.stormcrawler.util.ConfUtils;
@@ -42,7 +43,7 @@ public class SingleProxyManager implements ProxyManager {
         String proxyPassword = ConfUtils.getString(conf, "http.proxy.pass", 
null);
 
         // assemble proxy connection string
-        String proxyString = proxyType.toLowerCase() + "://";
+        String proxyString = proxyType.toLowerCase(Locale.ROOT) + "://";
 
         // conditionally append authentication info
         if (proxyUsername != null
@@ -53,7 +54,9 @@ public class SingleProxyManager implements ProxyManager {
         }
 
         // complete proxy string and create proxy
-        this.proxy = new SCProxy(proxyString + String.format("%s:%d", 
proxyHost, proxyPort));
+        this.proxy =
+                new SCProxy(
+                        proxyString + String.format(Locale.ROOT, "%s:%d", 
proxyHost, proxyPort));
     }
 
     @Override
diff --git a/core/src/main/java/org/apache/stormcrawler/spout/FileSpout.java 
b/core/src/main/java/org/apache/stormcrawler/spout/FileSpout.java
index 6d5d6f64..773b90a2 100644
--- a/core/src/main/java/org/apache/stormcrawler/spout/FileSpout.java
+++ b/core/src/main/java/org/apache/stormcrawler/spout/FileSpout.java
@@ -233,7 +233,7 @@ public class FileSpout extends BaseRichSpout {
     @Override
     public void fail(Object msgId) {
         if (msgId instanceof byte[]) {
-            String msg = new String((byte[]) msgId);
+            String msg = new String((byte[]) msgId, StandardCharsets.UTF_8);
             LOG.error("Failed - adding back to the queue: {}", msg);
             buffer.add((byte[]) msgId);
         } else {
diff --git 
a/core/src/main/java/org/apache/stormcrawler/util/CookieConverter.java 
b/core/src/main/java/org/apache/stormcrawler/util/CookieConverter.java
index 85691eed..0dc055ec 100644
--- a/core/src/main/java/org/apache/stormcrawler/util/CookieConverter.java
+++ b/core/src/main/java/org/apache/stormcrawler/util/CookieConverter.java
@@ -62,13 +62,13 @@ public class CookieConverter {
             for (int i = 1; i < tokens.length; i++) {
                 String ti = tokens[i].trim();
                 if (ti.equalsIgnoreCase("secure")) secure = true;
-                if (ti.toLowerCase().startsWith("path=")) {
+                if (ti.toLowerCase(Locale.ROOT).startsWith("path=")) {
                     path = ti.substring(5);
                 }
-                if (ti.toLowerCase().startsWith("domain=")) {
+                if (ti.toLowerCase(Locale.ROOT).startsWith("domain=")) {
                     domain = ti.substring(7);
                 }
-                if (ti.toLowerCase().startsWith("expires=")) {
+                if (ti.toLowerCase(Locale.ROOT).startsWith("expires=")) {
                     expires = ti.substring(8);
                 }
             }
diff --git 
a/core/src/main/java/org/apache/stormcrawler/util/PerSecondReducer.java 
b/core/src/main/java/org/apache/stormcrawler/util/PerSecondReducer.java
index 3dbc206f..706ef71f 100644
--- a/core/src/main/java/org/apache/stormcrawler/util/PerSecondReducer.java
+++ b/core/src/main/java/org/apache/stormcrawler/util/PerSecondReducer.java
@@ -49,7 +49,7 @@ public class PerSecondReducer implements 
IReducer<TimeReducerState> {
         double msec = System.currentTimeMillis() - accumulator.started;
         if (msec == 0) return 0;
         double permsec = accumulator.sum / msec;
-        return new Double(permsec * 1000d);
+        return Double.valueOf(permsec * 1000d);
     }
 }
 
diff --git 
a/core/src/test/java/org/apache/stormcrawler/bolt/JSoupParserBoltTest.java 
b/core/src/test/java/org/apache/stormcrawler/bolt/JSoupParserBoltTest.java
index d1f35cc1..00b0f127 100644
--- a/core/src/test/java/org/apache/stormcrawler/bolt/JSoupParserBoltTest.java
+++ b/core/src/test/java/org/apache/stormcrawler/bolt/JSoupParserBoltTest.java
@@ -17,6 +17,7 @@
 package org.apache.stormcrawler.bolt;
 
 import java.io.IOException;
+import java.nio.charset.StandardCharsets;
 import java.util.HashMap;
 import java.util.List;
 import java.util.Map;
@@ -190,7 +191,7 @@ public class JSoupParserBoltTest extends ParsingTester {
 
         for (int i = 0; i < tests.length; i++) {
 
-            byte[] bytes = tests[i].getBytes();
+            byte[] bytes = tests[i].getBytes(StandardCharsets.UTF_8);
 
            parse("http://www.digitalpebble.com", bytes, new Metadata());
 
diff --git 
a/core/src/test/java/org/apache/stormcrawler/filtering/BasicURLNormalizerTest.java
 
b/core/src/test/java/org/apache/stormcrawler/filtering/BasicURLNormalizerTest.java
index 940efbd2..e7620fc7 100644
--- 
a/core/src/test/java/org/apache/stormcrawler/filtering/BasicURLNormalizerTest.java
+++ 
b/core/src/test/java/org/apache/stormcrawler/filtering/BasicURLNormalizerTest.java
@@ -27,6 +27,7 @@ import java.net.URL;
 import java.util.Arrays;
 import java.util.HashMap;
 import java.util.List;
+import java.util.Locale;
 import java.util.Map;
 import org.apache.stormcrawler.Metadata;
 import org.apache.stormcrawler.filtering.basic.BasicURLNormalizer;
@@ -286,13 +287,13 @@ public class BasicURLNormalizerTest {
         URL testSourceUrl = new URL("http://blablabla.org/");
 
         String inputURL = "HTTP://www.quanjing.com/";
-        String expectedResult = inputURL.toLowerCase();
+        String expectedResult = inputURL.toLowerCase(Locale.ROOT);
         String normalizedUrl = urlFilter.filter(testSourceUrl, new Metadata(), 
inputURL);
 
         assertEquals("Failed to filter query string", expectedResult, 
normalizedUrl);
 
         inputURL = "http://www.QUANJING.COM/";
-        expectedResult = inputURL.toLowerCase();
+        expectedResult = inputURL.toLowerCase(Locale.ROOT);
         normalizedUrl = urlFilter.filter(testSourceUrl, new Metadata(), 
inputURL);
 
         assertEquals("Failed to filter query string", expectedResult, 
normalizedUrl);
diff --git 
a/core/src/test/java/org/apache/stormcrawler/indexer/IndexerTester.java 
b/core/src/test/java/org/apache/stormcrawler/indexer/IndexerTester.java
index 7dffe7ba..ddd193bb 100644
--- a/core/src/test/java/org/apache/stormcrawler/indexer/IndexerTester.java
+++ b/core/src/test/java/org/apache/stormcrawler/indexer/IndexerTester.java
@@ -20,6 +20,7 @@ import static org.mockito.Mockito.mock;
 import static org.mockito.Mockito.when;
 
 import java.io.IOException;
+import java.nio.charset.StandardCharsets;
 import java.util.Map;
 import org.apache.storm.task.OutputCollector;
 import org.apache.storm.tuple.Tuple;
@@ -59,7 +60,8 @@ public class IndexerTester {
 
     protected void index(String url, String content, Metadata metadata) throws 
IOException {
         Tuple tuple = mock(Tuple.class);
-        when(tuple.getBinaryByField("content")).thenReturn(content.getBytes());
+        when(tuple.getBinaryByField("content"))
+                .thenReturn(content.getBytes(StandardCharsets.UTF_8));
         when(tuple.getStringByField("url")).thenReturn(url);
         when(tuple.getValueByField("metadata")).thenReturn(metadata);
         bolt.execute(tuple);
diff --git 
a/core/src/test/java/org/apache/stormcrawler/persistence/AdaptiveSchedulerTest.java
 
b/core/src/test/java/org/apache/stormcrawler/persistence/AdaptiveSchedulerTest.java
index 4f51f696..1e7c6e78 100644
--- 
a/core/src/test/java/org/apache/stormcrawler/persistence/AdaptiveSchedulerTest.java
+++ 
b/core/src/test/java/org/apache/stormcrawler/persistence/AdaptiveSchedulerTest.java
@@ -25,8 +25,10 @@ import java.util.Calendar;
 import java.util.Date;
 import java.util.GregorianCalendar;
 import java.util.HashMap;
+import java.util.Locale;
 import java.util.Map;
 import java.util.Optional;
+import java.util.TimeZone;
 import org.apache.commons.lang.time.DateUtils;
 import org.apache.stormcrawler.Metadata;
 import org.apache.stormcrawler.protocol.HttpHeaders;
@@ -64,7 +66,7 @@ public class AdaptiveSchedulerTest {
         metadata.addValue("fetch.statusCode", "200");
         Optional<Date> nextFetch = scheduler.schedule(Status.FETCHED, 
metadata);
 
-        Calendar cal = Calendar.getInstance();
+        Calendar cal = Calendar.getInstance(TimeZone.getTimeZone("GMT"), 
Locale.ROOT);
         cal.add(Calendar.MINUTE, 6);
         Assert.assertEquals(
                 DateUtils.round(cal.getTime(), Calendar.SECOND),
@@ -72,7 +74,7 @@ public class AdaptiveSchedulerTest {
 
         nextFetch = scheduler.schedule(Status.ERROR, metadata);
 
-        cal = Calendar.getInstance();
+        cal = Calendar.getInstance(TimeZone.getTimeZone("GMT"), Locale.ROOT);
         cal.add(Calendar.MINUTE, 8);
         Assert.assertEquals(
                 DateUtils.round(cal.getTime(), Calendar.SECOND),
@@ -89,7 +91,11 @@ public class AdaptiveSchedulerTest {
         metadata.addValue(AdaptiveScheduler.SIGNATURE_KEY, md5sumEmptyContent);
         scheduler.schedule(Status.FETCHED, metadata);
         Instant firstFetch =
-                DateUtils.round(Calendar.getInstance().getTime(), 
Calendar.SECOND).toInstant();
+                DateUtils.round(
+                                
Calendar.getInstance(TimeZone.getTimeZone("GMT"), Locale.ROOT)
+                                        .getTime(),
+                                Calendar.SECOND)
+                        .toInstant();
 
         /* verify initial fetch interval and last-modified time */
         String lastModified = 
metadata.getFirstValue(HttpHeaders.LAST_MODIFIED);
@@ -135,7 +141,11 @@ public class AdaptiveSchedulerTest {
         metadata.addValue(AdaptiveScheduler.SIGNATURE_KEY, md5sumSpaceContent);
         scheduler.schedule(Status.FETCHED, metadata);
         Instant lastFetch =
-                DateUtils.round(Calendar.getInstance().getTime(), 
Calendar.SECOND).toInstant();
+                DateUtils.round(
+                                
Calendar.getInstance(TimeZone.getTimeZone("GMT"), Locale.ROOT)
+                                        .getTime(),
+                                Calendar.SECOND)
+                        .toInstant();
         fetchInterval = 
metadata.getFirstValue(AdaptiveScheduler.FETCH_INTERVAL_KEY);
         Assert.assertNotNull(fetchInterval);
         /* interval should now shrink */
diff --git 
a/core/src/test/java/org/apache/stormcrawler/persistence/DefaultSchedulerTest.java
 
b/core/src/test/java/org/apache/stormcrawler/persistence/DefaultSchedulerTest.java
index 451a606a..192ddfc4 100644
--- 
a/core/src/test/java/org/apache/stormcrawler/persistence/DefaultSchedulerTest.java
+++ 
b/core/src/test/java/org/apache/stormcrawler/persistence/DefaultSchedulerTest.java
@@ -20,8 +20,10 @@ import java.net.MalformedURLException;
 import java.util.Calendar;
 import java.util.Date;
 import java.util.HashMap;
+import java.util.Locale;
 import java.util.Map;
 import java.util.Optional;
+import java.util.TimeZone;
 import org.apache.commons.lang.time.DateUtils;
 import org.apache.stormcrawler.Metadata;
 import org.junit.Assert;
@@ -40,7 +42,7 @@ public class DefaultSchedulerTest {
         metadata.addValue("testKey", "someValue");
         Optional<Date> nextFetch = scheduler.schedule(Status.FETCHED, 
metadata);
 
-        Calendar cal = Calendar.getInstance();
+        Calendar cal = Calendar.getInstance(TimeZone.getTimeZone("GMT"), 
Locale.ROOT);
         cal.add(Calendar.MINUTE, 360);
         Assert.assertEquals(
                 DateUtils.round(cal.getTime(), Calendar.SECOND),
@@ -48,7 +50,7 @@ public class DefaultSchedulerTest {
 
         nextFetch = scheduler.schedule(Status.ERROR, metadata);
 
-        cal = Calendar.getInstance();
+        cal = Calendar.getInstance(TimeZone.getTimeZone("GMT"), Locale.ROOT);
         cal.add(Calendar.MINUTE, 3600);
         Assert.assertEquals(
                 DateUtils.round(cal.getTime(), Calendar.SECOND),
@@ -66,7 +68,7 @@ public class DefaultSchedulerTest {
         metadata.addValue("testKey.key2", "someValue");
         Optional<Date> nextFetch = scheduler.schedule(Status.FETCHED, 
metadata);
 
-        Calendar cal = Calendar.getInstance();
+        Calendar cal = Calendar.getInstance(TimeZone.getTimeZone("GMT"), 
Locale.ROOT);
         cal.add(Calendar.MINUTE, 360);
         Assert.assertEquals(
                 DateUtils.round(cal.getTime(), Calendar.SECOND),
diff --git 
a/core/src/test/java/org/apache/stormcrawler/proxy/MultiProxyManagerTest.java 
b/core/src/test/java/org/apache/stormcrawler/proxy/MultiProxyManagerTest.java
index e587652b..5aa01dab 100644
--- 
a/core/src/test/java/org/apache/stormcrawler/proxy/MultiProxyManagerTest.java
+++ 
b/core/src/test/java/org/apache/stormcrawler/proxy/MultiProxyManagerTest.java
@@ -18,6 +18,7 @@ package org.apache.stormcrawler.proxy;
 
 import java.io.FileWriter;
 import java.io.IOException;
+import java.nio.charset.StandardCharsets;
 import java.nio.file.Files;
 import java.nio.file.Paths;
 import org.apache.storm.Config;
@@ -53,7 +54,7 @@ public class MultiProxyManagerTest {
             "sock5://example.com:64000",
         };
 
-        FileWriter writer = new FileWriter("/tmp/proxies.txt");
+        FileWriter writer = new FileWriter("/tmp/proxies.txt", 
StandardCharsets.UTF_8);
         for (String proxyString : proxyStrings) {
             writer.write("# fake comment to test" + "\n");
             writer.write("// fake comment to test" + "\n");
diff --git 
a/external/aws/src/main/java/org/apache/stormcrawler/aws/bolt/CloudSearchIndexerBolt.java
 
b/external/aws/src/main/java/org/apache/stormcrawler/aws/bolt/CloudSearchIndexerBolt.java
index 2d8027f5..03618df7 100644
--- 
a/external/aws/src/main/java/org/apache/stormcrawler/aws/bolt/CloudSearchIndexerBolt.java
+++ 
b/external/aws/src/main/java/org/apache/stormcrawler/aws/bolt/CloudSearchIndexerBolt.java
@@ -46,6 +46,7 @@ import java.util.ArrayList;
 import java.util.HashMap;
 import java.util.Iterator;
 import java.util.List;
+import java.util.Locale;
 import java.util.Map;
 import java.util.Map.Entry;
 import org.apache.commons.io.FileUtils;
@@ -73,7 +74,7 @@ public class CloudSearchIndexerBolt extends 
AbstractIndexerBolt {
     private static final int MAX_SIZE_DOC_BYTES = 1048576;
 
     private static final SimpleDateFormat DATE_FORMAT =
-            new SimpleDateFormat("yyyy-MM-dd'T'HH:mm:ss.SSS'Z'");
+            new SimpleDateFormat("yyyy-MM-dd'T'HH:mm:ss.SSS'Z'", Locale.ROOT);
 
     private AmazonCloudSearchDomainClient client;
 
diff --git 
a/external/aws/src/main/java/org/apache/stormcrawler/aws/bolt/CloudSearchUtils.java
 
b/external/aws/src/main/java/org/apache/stormcrawler/aws/bolt/CloudSearchUtils.java
index cb912a63..e35916cc 100644
--- 
a/external/aws/src/main/java/org/apache/stormcrawler/aws/bolt/CloudSearchUtils.java
+++ 
b/external/aws/src/main/java/org/apache/stormcrawler/aws/bolt/CloudSearchUtils.java
@@ -19,6 +19,7 @@ package org.apache.stormcrawler.aws.bolt;
 import java.nio.charset.StandardCharsets;
 import java.security.MessageDigest;
 import java.security.NoSuchAlgorithmException;
+import java.util.Locale;
 import java.util.regex.Pattern;
 import org.apache.commons.codec.binary.Hex;
 
@@ -73,7 +74,7 @@ public class CloudSearchUtils {
      * @return
      */
     public static String cleanFieldName(String name) {
-        String lowercase = name.toLowerCase();
+        String lowercase = name.toLowerCase(Locale.ROOT);
         lowercase = lowercase.replaceAll("[^a-z_0-9]", "_");
         if (lowercase.length() < 3 || lowercase.length() > 64)
             throw new RuntimeException("Field name must be between 3 and 64 
chars : " + lowercase);
diff --git 
a/external/opensearch/src/main/java/org/apache/stormcrawler/opensearch/metrics/MetricsConsumer.java
 
b/external/opensearch/src/main/java/org/apache/stormcrawler/opensearch/metrics/MetricsConsumer.java
index 3014f2c6..3a336783 100644
--- 
a/external/opensearch/src/main/java/org/apache/stormcrawler/opensearch/metrics/MetricsConsumer.java
+++ 
b/external/opensearch/src/main/java/org/apache/stormcrawler/opensearch/metrics/MetricsConsumer.java
@@ -23,6 +23,7 @@ import java.text.SimpleDateFormat;
 import java.util.Collection;
 import java.util.Date;
 import java.util.Iterator;
+import java.util.Locale;
 import java.util.Map;
 import java.util.Map.Entry;
 import org.apache.storm.metric.api.IMetricsConsumer;
@@ -75,7 +76,7 @@ public class MetricsConsumer implements IMetricsConsumer {
         indexName = ConfUtils.getString(stormConf, 
OSMetricsIndexNameParamName, "metrics");
         stormID = context.getStormId();
         if (registrationArgument != null) {
-            dateFormat = new SimpleDateFormat((String) registrationArgument);
+            dateFormat = new SimpleDateFormat((String) registrationArgument, 
Locale.ROOT);
             LOG.info("Using date format {}", registrationArgument);
         }
         try {
diff --git 
a/external/opensearch/src/main/java/org/apache/stormcrawler/opensearch/persistence/AggregationSpout.java
 
b/external/opensearch/src/main/java/org/apache/stormcrawler/opensearch/persistence/AggregationSpout.java
index 696c2808..8d786260 100644
--- 
a/external/opensearch/src/main/java/org/apache/stormcrawler/opensearch/persistence/AggregationSpout.java
+++ 
b/external/opensearch/src/main/java/org/apache/stormcrawler/opensearch/persistence/AggregationSpout.java
@@ -23,8 +23,10 @@ import java.util.Calendar;
 import java.util.Date;
 import java.util.HashSet;
 import java.util.Iterator;
+import java.util.Locale;
 import java.util.Map;
 import java.util.Set;
+import java.util.TimeZone;
 import java.util.concurrent.TimeUnit;
 import org.apache.commons.lang.StringUtils;
 import org.apache.storm.spout.SpoutOutputCollector;
@@ -305,16 +307,17 @@ public class AggregationSpout extends AbstractSpout 
implements ActionListener<Se
         // returned in the query and add to it, unless the previous value is
         // within n mins in which case we'll keep it
         if (mostRecentDateFound != null && recentDateIncrease >= 0) {
-            Calendar potentialNewDate = Calendar.getInstance();
+            Calendar potentialNewDate =
+                    Calendar.getInstance(TimeZone.getTimeZone("GMT"), 
Locale.ROOT);
             
potentialNewDate.setTimeInMillis(mostRecentDateFound.toEpochMilli());
             potentialNewDate.add(Calendar.MINUTE, recentDateIncrease);
             Date oldDate = null;
             // check boundaries
             if (this.recentDateMinGap > 0) {
-                Calendar low = Calendar.getInstance();
+                Calendar low = 
Calendar.getInstance(TimeZone.getTimeZone("GMT"), Locale.ROOT);
                 low.setTime(queryDate);
                 low.add(Calendar.MINUTE, -recentDateMinGap);
-                Calendar high = Calendar.getInstance();
+                Calendar high = 
Calendar.getInstance(TimeZone.getTimeZone("GMT"), Locale.ROOT);
                 high.setTime(queryDate);
                 high.add(Calendar.MINUTE, recentDateMinGap);
                 if (high.before(potentialNewDate) || 
low.after(potentialNewDate)) {
diff --git 
a/external/opensearch/src/main/java/org/apache/stormcrawler/opensearch/persistence/StatusUpdaterBolt.java
 
b/external/opensearch/src/main/java/org/apache/stormcrawler/opensearch/persistence/StatusUpdaterBolt.java
index 7e52a8f0..1fdfd491 100644
--- 
a/external/opensearch/src/main/java/org/apache/stormcrawler/opensearch/persistence/StatusUpdaterBolt.java
+++ 
b/external/opensearch/src/main/java/org/apache/stormcrawler/opensearch/persistence/StatusUpdaterBolt.java
@@ -26,6 +26,7 @@ import java.util.Date;
 import java.util.HashSet;
 import java.util.LinkedList;
 import java.util.List;
+import java.util.Locale;
 import java.util.Map;
 import java.util.Objects;
 import java.util.Optional;
@@ -125,13 +126,19 @@ public class StatusUpdaterBolt extends 
AbstractStatusUpdaterBolt
         indexName =
                 ConfUtils.getString(
                         stormConf,
-                        
String.format(StatusUpdaterBolt.OSStatusIndexNameParamName, OSBoltType),
+                        String.format(
+                                Locale.ROOT,
+                                StatusUpdaterBolt.OSStatusIndexNameParamName,
+                                OSBoltType),
                         "status");
 
         doRouting =
                 ConfUtils.getBoolean(
                         stormConf,
-                        
String.format(StatusUpdaterBolt.OSStatusRoutingParamName, OSBoltType),
+                        String.format(
+                                Locale.ROOT,
+                                StatusUpdaterBolt.OSStatusRoutingParamName,
+                                OSBoltType),
                         false);
 
         partitioner = new URLPartitioner();
@@ -140,7 +147,10 @@ public class StatusUpdaterBolt extends 
AbstractStatusUpdaterBolt
         fieldNameForRoutingKey =
                 ConfUtils.getString(
                         stormConf,
-                        
String.format(StatusUpdaterBolt.OSStatusRoutingFieldParamName, OSBoltType));
+                        String.format(
+                                Locale.ROOT,
+                                
StatusUpdaterBolt.OSStatusRoutingFieldParamName,
+                                OSBoltType));
         if (StringUtils.isNotBlank(fieldNameForRoutingKey)) {
             if (fieldNameForRoutingKey.startsWith("metadata.")) {
                 routingFieldNameInMetadata = true;
diff --git 
a/external/solr/src/main/java/org/apache/stormcrawler/solr/metrics/MetricsConsumer.java
 
b/external/solr/src/main/java/org/apache/stormcrawler/solr/metrics/MetricsConsumer.java
index 982f562c..637f7d24 100644
--- 
a/external/solr/src/main/java/org/apache/stormcrawler/solr/metrics/MetricsConsumer.java
+++ 
b/external/solr/src/main/java/org/apache/stormcrawler/solr/metrics/MetricsConsumer.java
@@ -21,6 +21,7 @@ import java.text.SimpleDateFormat;
 import java.util.Collection;
 import java.util.Date;
 import java.util.Iterator;
+import java.util.Locale;
 import java.util.Map;
 import java.util.Map.Entry;
 import org.apache.solr.common.SolrInputDocument;
@@ -36,7 +37,7 @@ public class MetricsConsumer implements IMetricsConsumer {
 
     private final Logger LOG = LoggerFactory.getLogger(MetricsConsumer.class);
 
-    private final DateFormat df = new 
SimpleDateFormat("yyyy-MM-dd'T'HH:mm:ss'Z'");
+    private final DateFormat df = new 
SimpleDateFormat("yyyy-MM-dd'T'HH:mm:ss'Z'", Locale.ROOT);
 
     private static final String BOLT_TYPE = "metrics";
 
diff --git 
a/external/solr/src/main/java/org/apache/stormcrawler/solr/persistence/StatusUpdaterBolt.java
 
b/external/solr/src/main/java/org/apache/stormcrawler/solr/persistence/StatusUpdaterBolt.java
index 3f872870..562d03e0 100644
--- 
a/external/solr/src/main/java/org/apache/stormcrawler/solr/persistence/StatusUpdaterBolt.java
+++ 
b/external/solr/src/main/java/org/apache/stormcrawler/solr/persistence/StatusUpdaterBolt.java
@@ -18,6 +18,7 @@ package org.apache.stormcrawler.solr.persistence;
 
 import java.util.Date;
 import java.util.Iterator;
+import java.util.Locale;
 import java.util.Map;
 import java.util.Optional;
 import org.apache.solr.common.SolrInputDocument;
@@ -83,7 +84,7 @@ public class StatusUpdaterBolt extends 
AbstractStatusUpdaterBolt {
         while (keyIterator.hasNext()) {
             String key = keyIterator.next();
             String[] values = metadata.getValues(key);
-            doc.setField(String.format("%s.%s", mdPrefix, key), values);
+            doc.setField(String.format(Locale.ROOT, "%s.%s", mdPrefix, key), 
values);
         }
 
         if (nextFetch.isPresent()) {
diff --git 
a/external/sql/src/main/java/org/apache/stormcrawler/sql/SQLSpout.java 
b/external/sql/src/main/java/org/apache/stormcrawler/sql/SQLSpout.java
index 8207130e..facb1409 100644
--- a/external/sql/src/main/java/org/apache/stormcrawler/sql/SQLSpout.java
+++ b/external/sql/src/main/java/org/apache/stormcrawler/sql/SQLSpout.java
@@ -17,6 +17,7 @@
 package org.apache.stormcrawler.sql;
 
 import java.nio.ByteBuffer;
+import java.nio.charset.StandardCharsets;
 import java.sql.Connection;
 import java.sql.ResultSet;
 import java.sql.SQLException;
@@ -173,7 +174,8 @@ public class SQLSpout extends AbstractQueryingSpout {
                     metadata = "\t" + metadata;
                 }
                 String URLMD = url + metadata;
-                List<Object> v = 
SCHEME.deserialize(ByteBuffer.wrap(URLMD.getBytes()));
+                List<Object> v =
+                        
SCHEME.deserialize(ByteBuffer.wrap(URLMD.getBytes(StandardCharsets.UTF_8)));
                 buffer.add(url, (Metadata) v.get(1));
             }
 
diff --git 
a/external/warc/src/main/java/org/apache/stormcrawler/warc/WARCFileNameFormat.java
 
b/external/warc/src/main/java/org/apache/stormcrawler/warc/WARCFileNameFormat.java
index 097650b0..319ef1cb 100644
--- 
a/external/warc/src/main/java/org/apache/stormcrawler/warc/WARCFileNameFormat.java
+++ 
b/external/warc/src/main/java/org/apache/stormcrawler/warc/WARCFileNameFormat.java
@@ -18,6 +18,7 @@ package org.apache.stormcrawler.warc;
 
 import java.text.SimpleDateFormat;
 import java.util.Date;
+import java.util.Locale;
 import java.util.Map;
 import java.util.TimeZone;
 import org.apache.storm.hdfs.bolt.format.FileNameFormat;
@@ -70,18 +71,18 @@ public class WARCFileNameFormat implements FileNameFormat {
 
     @Override
     public String getName(long rotation, long timeStamp) {
-        SimpleDateFormat fileDate = new SimpleDateFormat("yyyyMMddHHmmss");
+        SimpleDateFormat fileDate = new SimpleDateFormat("yyyyMMddHHmmss", 
Locale.ROOT);
         fileDate.setTimeZone(TimeZone.getTimeZone("GMT"));
         String taskindexString = "";
         if (this.taskIndex != -1) {
-            taskindexString = String.format("%02d", this.taskIndex) + "-";
+            taskindexString = String.format(Locale.ROOT, "%02d", 
this.taskIndex) + "-";
         }
         return this.prefix
                 + "-"
                 + fileDate.format(new Date(timeStamp))
                 + "-"
                 + taskindexString
-                + String.format("%05d", rotation)
+                + String.format(Locale.ROOT, "%05d", rotation)
                 + this.extension;
     }
 
diff --git 
a/external/warc/src/main/java/org/apache/stormcrawler/warc/WARCRecordFormat.java
 
b/external/warc/src/main/java/org/apache/stormcrawler/warc/WARCRecordFormat.java
index 2b8bf873..aba12641 100644
--- 
a/external/warc/src/main/java/org/apache/stormcrawler/warc/WARCRecordFormat.java
+++ 
b/external/warc/src/main/java/org/apache/stormcrawler/warc/WARCRecordFormat.java
@@ -330,7 +330,7 @@ public class WARCRecordFormat implements RecordFormat {
         if (StringUtils.isNotBlank(headersVerbatim)) {
             WARCTypeValue = WARC_TYPE_RESPONSE;
             headersVerbatim = fixHttpHeaders(headersVerbatim, content.length);
-            httpheaders = headersVerbatim.getBytes();
+            httpheaders = headersVerbatim.getBytes(StandardCharsets.UTF_8);
         }
 
         StringBuilder buffer = new StringBuilder();
diff --git 
a/external/warc/src/main/java/org/apache/stormcrawler/warc/WARCRequestRecordFormat.java
 
b/external/warc/src/main/java/org/apache/stormcrawler/warc/WARCRequestRecordFormat.java
index 63a45dbf..c91a21e1 100644
--- 
a/external/warc/src/main/java/org/apache/stormcrawler/warc/WARCRequestRecordFormat.java
+++ 
b/external/warc/src/main/java/org/apache/stormcrawler/warc/WARCRequestRecordFormat.java
@@ -58,7 +58,7 @@ public class WARCRequestRecordFormat extends WARCRecordFormat 
{
             LOG.warn("No request header for {}", url);
             return new byte[] {};
         }
-        final byte[] httpheaders = fixHttpHeaders(headersVerbatim).getBytes();
+        final byte[] httpheaders = 
fixHttpHeaders(headersVerbatim).getBytes(StandardCharsets.UTF_8);
 
         StringBuilder buffer = new StringBuilder();
         buffer.append(WARC_VERSION);
diff --git a/pom.xml b/pom.xml
index 375763ee..aa76643d 100644
--- a/pom.xml
+++ b/pom.xml
@@ -338,6 +338,33 @@ under the License.
                                        </execution>
                                </executions>
                        </plugin>
+                       <plugin>
+                               <groupId>de.thetaphi</groupId>
+                               <artifactId>forbiddenapis</artifactId>
+                               <version>3.7</version>
+                               <configuration>
+                                       <targetVersion>11</targetVersion>
+                                       
<ignoreSignaturesOfMissingClasses>true</ignoreSignaturesOfMissingClasses>
+                                       
<failOnUnsupportedJava>false</failOnUnsupportedJava>
+                                       
<excludes>test-documents/*.class</excludes>
+                                       <bundledSignatures>
+                                               
<bundledSignature>jdk-unsafe</bundledSignature>
+                                               
<bundledSignature>jdk-deprecated</bundledSignature>
+                                               
<bundledSignature>jdk-non-portable</bundledSignature>
+                                               
<bundledSignature>jdk-internal</bundledSignature>
+                                               <!-- replace with 
${commons.io.version} when available -->
+                                               
<bundledSignature>commons-io-unsafe-2.11.0</bundledSignature>
+                                       </bundledSignatures>
+                               </configuration>
+                               <executions>
+                                       <execution>
+                                               <goals>
+                                                       <goal>check</goal>
+                                                       <goal>testCheck</goal>
+                                               </goals>
+                                       </execution>
+                               </executions>
+                       </plugin>
                        <plugin>
                                <groupId>org.codehaus.mojo</groupId>
                                <artifactId>license-maven-plugin</artifactId>
@@ -485,8 +512,8 @@ under the License.
                                <type>pom</type>
                                <scope>import</scope>
                        </dependency>
-
-            <!-- Necessary until https://github.com/apache/storm/pull/3626 -->
+                       <!-- Necessary until 
https://github.com/apache/storm/pull/3626 -->
+                       <!-- update forbidden-apis when you update this -->
                        <dependency>
                                <groupId>commons-io</groupId>
                                <artifactId>commons-io</artifactId>

Reply via email to