This is an automated email from the ASF dual-hosted git repository.
jnioche pushed a commit to branch main
in repository https://gitbox.apache.org/repos/asf/incubator-stormcrawler.git
The following commit(s) were added to refs/heads/main by this push:
new c1088fb3 #1207 -- add forbidden-apis (#1208)
c1088fb3 is described below
commit c1088fb3ff3ca9ca99bcce108d8bb2b40b97c094
Author: Tim Allison <[email protected]>
AuthorDate: Fri May 3 11:54:07 2024 -0400
#1207 -- add forbidden-apis (#1208)
* Add forbidden-apis -- this just adds the plugin. I'll update the repo to
pass it in follow-on commits. This is just a WIP.
* Add forbidden-apis -- stormcrawler-core
* Add forbidden-apis -- rest of the project
---
.../apache/stormcrawler/bolt/FeedParserBolt.java | 8 ++++--
.../apache/stormcrawler/bolt/JSoupParserBolt.java | 3 ++-
.../stormcrawler/bolt/SiteMapParserBolt.java | 10 ++++---
.../filtering/regex/FastURLFilter.java | 5 ++--
.../filtering/regex/RegexURLNormalizer.java | 6 +++--
.../apache/stormcrawler/parse/JSoupFilters.java | 3 ++-
.../apache/stormcrawler/parse/ParseFilters.java | 3 ++-
.../apache/stormcrawler/parse/TextExtractor.java | 3 ++-
.../parse/filter/MimeTypeNormalization.java | 17 ++++++------
.../persistence/AdaptiveScheduler.java | 3 ++-
.../stormcrawler/persistence/DefaultScheduler.java | 2 +-
.../stormcrawler/protocol/RobotRulesParser.java | 9 ++++---
.../stormcrawler/protocol/file/FileResponse.java | 3 ++-
.../protocol/httpclient/HttpProtocol.java | 5 +++-
.../stormcrawler/protocol/okhttp/HttpProtocol.java | 31 +++++++++++++++++-----
.../protocol/selenium/SeleniumProtocol.java | 3 ++-
.../stormcrawler/proxy/MultiProxyManager.java | 8 +++---
.../stormcrawler/proxy/SingleProxyManager.java | 7 +++--
.../org/apache/stormcrawler/spout/FileSpout.java | 2 +-
.../apache/stormcrawler/util/CookieConverter.java | 6 ++---
.../apache/stormcrawler/util/PerSecondReducer.java | 2 +-
.../stormcrawler/bolt/JSoupParserBoltTest.java | 3 ++-
.../filtering/BasicURLNormalizerTest.java | 5 ++--
.../apache/stormcrawler/indexer/IndexerTester.java | 4 ++-
.../persistence/AdaptiveSchedulerTest.java | 18 ++++++++++---
.../persistence/DefaultSchedulerTest.java | 8 +++---
.../stormcrawler/proxy/MultiProxyManagerTest.java | 3 ++-
.../aws/bolt/CloudSearchIndexerBolt.java | 3 ++-
.../stormcrawler/aws/bolt/CloudSearchUtils.java | 3 ++-
.../opensearch/metrics/MetricsConsumer.java | 3 ++-
.../opensearch/persistence/AggregationSpout.java | 9 ++++---
.../opensearch/persistence/StatusUpdaterBolt.java | 16 ++++++++---
.../stormcrawler/solr/metrics/MetricsConsumer.java | 3 ++-
.../solr/persistence/StatusUpdaterBolt.java | 3 ++-
.../java/org/apache/stormcrawler/sql/SQLSpout.java | 4 ++-
.../stormcrawler/warc/WARCFileNameFormat.java | 7 ++---
.../apache/stormcrawler/warc/WARCRecordFormat.java | 2 +-
.../stormcrawler/warc/WARCRequestRecordFormat.java | 2 +-
pom.xml | 31 ++++++++++++++++++++--
39 files changed, 187 insertions(+), 79 deletions(-)
diff --git a/core/src/main/java/org/apache/stormcrawler/bolt/FeedParserBolt.java b/core/src/main/java/org/apache/stormcrawler/bolt/FeedParserBolt.java
index 112ac816..d1dde17c 100644
--- a/core/src/main/java/org/apache/stormcrawler/bolt/FeedParserBolt.java
+++ b/core/src/main/java/org/apache/stormcrawler/bolt/FeedParserBolt.java
@@ -25,12 +25,15 @@ import com.rometools.rome.feed.synd.SyndFeed;
import com.rometools.rome.io.SyndFeedInput;
import java.io.ByteArrayInputStream;
import java.net.URL;
+import java.nio.charset.StandardCharsets;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.Calendar;
import java.util.Date;
import java.util.List;
+import java.util.Locale;
import java.util.Map;
+import java.util.TimeZone;
import org.apache.commons.lang.StringUtils;
import org.apache.storm.task.OutputCollector;
import org.apache.storm.task.TopologyContext;
@@ -85,7 +88,7 @@ public class FeedParserBolt extends StatusEmitterBolt {
isfeed = true;
} else {
// try based on the first bytes?
- byte[] clue = "<rss ".getBytes();
+ byte[] clue = "<rss ".getBytes(StandardCharsets.UTF_8);
byte[] beginning = content;
final int maxOffsetGuess = 100;
if (content.length > maxOffsetGuess) {
@@ -195,7 +198,8 @@ public class FeedParserBolt extends StatusEmitterBolt {
if (publishedDate != null) {
// filter based on the published date
if (filterHoursSincePub != -1) {
- Calendar rightNow = Calendar.getInstance();
+ Calendar rightNow =
+ Calendar.getInstance(TimeZone.getTimeZone("GMT"), Locale.ROOT);
rightNow.add(Calendar.HOUR, -filterHoursSincePub);
if (publishedDate.before(rightNow.getTime())) {
LOG.info(
diff --git
a/core/src/main/java/org/apache/stormcrawler/bolt/JSoupParserBolt.java
b/core/src/main/java/org/apache/stormcrawler/bolt/JSoupParserBolt.java
index 67aa673e..44e994ef 100644
--- a/core/src/main/java/org/apache/stormcrawler/bolt/JSoupParserBolt.java
+++ b/core/src/main/java/org/apache/stormcrawler/bolt/JSoupParserBolt.java
@@ -28,6 +28,7 @@ import java.nio.charset.Charset;
import java.util.HashMap;
import java.util.LinkedList;
import java.util.List;
+import java.util.Locale;
import java.util.Map;
import org.apache.commons.lang.StringUtils;
import org.apache.storm.metric.api.MultiCountMetric;
@@ -190,7 +191,7 @@ public class JSoupParserBolt extends StatusEmitterBolt {
}
if (StringUtils.isNotBlank(mimeType)) {
- if (mimeType.toLowerCase().contains("html")) {
+ if (mimeType.toLowerCase(Locale.ROOT).contains("html")) {
CT_OK = true;
}
}
diff --git
a/core/src/main/java/org/apache/stormcrawler/bolt/SiteMapParserBolt.java
b/core/src/main/java/org/apache/stormcrawler/bolt/SiteMapParserBolt.java
index 22fbb20d..7e2c88f0 100644
--- a/core/src/main/java/org/apache/stormcrawler/bolt/SiteMapParserBolt.java
+++ b/core/src/main/java/org/apache/stormcrawler/bolt/SiteMapParserBolt.java
@@ -31,6 +31,7 @@ import crawlercommons.sitemaps.extension.Extension;
import crawlercommons.sitemaps.extension.ExtensionMetadata;
import java.io.IOException;
import java.net.URL;
+import java.nio.charset.StandardCharsets;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.Calendar;
@@ -38,7 +39,9 @@ import java.util.Collection;
import java.util.Date;
import java.util.Iterator;
import java.util.List;
+import java.util.Locale;
import java.util.Map;
+import java.util.TimeZone;
import org.apache.commons.lang.StringUtils;
import org.apache.storm.metric.api.MeanReducer;
import org.apache.storm.metric.api.ReducedMetric;
@@ -73,7 +76,7 @@ public class SiteMapParserBolt extends StatusEmitterBolt {
private static final org.slf4j.Logger LOG =
LoggerFactory.getLogger(SiteMapParserBolt.class);
- private static final byte[] clue = Namespace.SITEMAP.getBytes();
+ private static final byte[] clue = Namespace.SITEMAP.getBytes(StandardCharsets.UTF_8);
private SiteMapParser parser;
@@ -198,7 +201,7 @@ public class SiteMapParserBolt extends StatusEmitterBolt {
SiteMapIndex smi = (SiteMapIndex) siteMap;
Collection<AbstractSiteMap> subsitemaps = smi.getSitemaps();
- Calendar rightNow = Calendar.getInstance();
+ Calendar rightNow = Calendar.getInstance(TimeZone.getTimeZone("GMT"), Locale.ROOT);
rightNow.add(Calendar.HOUR, -filterHoursSinceModified);
int delay = 0;
@@ -274,7 +277,8 @@ public class SiteMapParserBolt extends StatusEmitterBolt {
if (lastModified != null) {
// filter based on the published date
if (filterHoursSinceModified != -1) {
- Calendar rightNow = Calendar.getInstance();
+ Calendar rightNow =
+ Calendar.getInstance(TimeZone.getTimeZone("GMT"), Locale.ROOT);
rightNow.add(Calendar.HOUR, -filterHoursSinceModified);
if (lastModified.before(rightNow.getTime())) {
LOG.info(
diff --git
a/core/src/main/java/org/apache/stormcrawler/filtering/regex/FastURLFilter.java
b/core/src/main/java/org/apache/stormcrawler/filtering/regex/FastURLFilter.java
index 43077b8b..671b9c0f 100644
--- a/core/src/main/java/org/apache/stormcrawler/filtering/regex/FastURLFilter.java
+++ b/core/src/main/java/org/apache/stormcrawler/filtering/regex/FastURLFilter.java
@@ -29,6 +29,7 @@ import java.util.HashMap;
import java.util.Iterator;
import java.util.LinkedList;
import java.util.List;
+import java.util.Locale;
import java.util.Map;
import java.util.regex.Pattern;
import org.apache.stormcrawler.JSONResource;
@@ -327,10 +328,10 @@ class Rule {
public Rule(String line) {
int offset = 0;
- String lcline = line.toLowerCase();
+ String lcline = line.toLowerCase(Locale.ROOT);
// separate the type from the pattern
for (Type t : Type.values()) {
- String start = t.toString().toLowerCase() + " ";
+ String start = t.toString().toLowerCase(Locale.ROOT) + " ";
if (lcline.startsWith(start)) {
type = t;
offset = start.length();
diff --git
a/core/src/main/java/org/apache/stormcrawler/filtering/regex/RegexURLNormalizer.java
b/core/src/main/java/org/apache/stormcrawler/filtering/regex/RegexURLNormalizer.java
index 31aa6ac3..4679f8e1 100644
---
a/core/src/main/java/org/apache/stormcrawler/filtering/regex/RegexURLNormalizer.java
+++
b/core/src/main/java/org/apache/stormcrawler/filtering/regex/RegexURLNormalizer.java
@@ -20,6 +20,7 @@ import com.fasterxml.jackson.databind.JsonNode;
import com.fasterxml.jackson.databind.node.ArrayNode;
import java.io.FileNotFoundException;
import java.io.FileReader;
+import java.io.IOException;
import java.io.InputStream;
import java.io.InputStreamReader;
import java.io.Reader;
@@ -236,9 +237,10 @@ public class RegexURLNormalizer extends URLFilter {
* Utility method to test rules against an input. the first arg is the
absolute path of the
* rules file, the second is the URL to be normalised
*/
- public static void main(String[] args) throws FileNotFoundException {
+ public static void main(String[] args) throws FileNotFoundException, IOException {
RegexURLNormalizer normalizer = new RegexURLNormalizer();
- normalizer.rules = normalizer.readConfiguration(new FileReader(args[0]));
+ normalizer.rules =
+ normalizer.readConfiguration(new FileReader(args[0], StandardCharsets.UTF_8));
String output = normalizer.filter(null, null, args[1]);
diff --git a/core/src/main/java/org/apache/stormcrawler/parse/JSoupFilters.java
b/core/src/main/java/org/apache/stormcrawler/parse/JSoupFilters.java
index 53ca12f2..91474006 100644
--- a/core/src/main/java/org/apache/stormcrawler/parse/JSoupFilters.java
+++ b/core/src/main/java/org/apache/stormcrawler/parse/JSoupFilters.java
@@ -21,6 +21,7 @@ import com.fasterxml.jackson.databind.ObjectMapper;
import java.io.IOException;
import java.io.InputStream;
import java.net.URL;
+import java.nio.charset.StandardCharsets;
import java.util.List;
import java.util.Map;
import org.apache.commons.cli.CommandLine;
@@ -154,7 +155,7 @@ public class JSoupFilters extends AbstractConfigurable
implements JSoupFilter, J
byte[] content = IOUtils.toByteArray((new URL(url)).openStream());
- Document doc = Jsoup.parse(new String(content), url);
+ Document doc = Jsoup.parse(new String(content, StandardCharsets.UTF_8), url);
filters.filter(url, content, doc, parse);
diff --git a/core/src/main/java/org/apache/stormcrawler/parse/ParseFilters.java
b/core/src/main/java/org/apache/stormcrawler/parse/ParseFilters.java
index 83c48afe..e2a0723e 100644
--- a/core/src/main/java/org/apache/stormcrawler/parse/ParseFilters.java
+++ b/core/src/main/java/org/apache/stormcrawler/parse/ParseFilters.java
@@ -23,6 +23,7 @@ import com.fasterxml.jackson.databind.ObjectMapper;
import java.io.IOException;
import java.io.InputStream;
import java.net.URL;
+import java.nio.charset.StandardCharsets;
import java.util.List;
import java.util.Map;
import org.apache.commons.cli.CommandLine;
@@ -180,7 +181,7 @@ public class ParseFilters extends ParseFilter implements
JSONResource {
byte[] content = IOUtils.toByteArray((new URL(url)).openStream());
- Document doc = Jsoup.parse(new String(content), url);
+ Document doc = Jsoup.parse(new String(content, StandardCharsets.UTF_8), url);
filters.filter(url, content, DocumentFragmentBuilder.fromJsoup(doc),
parse);
diff --git
a/core/src/main/java/org/apache/stormcrawler/parse/TextExtractor.java
b/core/src/main/java/org/apache/stormcrawler/parse/TextExtractor.java
index 7f79a94d..99fdd16c 100644
--- a/core/src/main/java/org/apache/stormcrawler/parse/TextExtractor.java
+++ b/core/src/main/java/org/apache/stormcrawler/parse/TextExtractor.java
@@ -18,6 +18,7 @@ package org.apache.stormcrawler.parse;
import java.util.HashSet;
import java.util.List;
+import java.util.Locale;
import java.util.Map;
import org.apache.stormcrawler.util.ConfUtils;
import org.jetbrains.annotations.Contract;
@@ -74,7 +75,7 @@ public class TextExtractor {
inclusionPatterns = ConfUtils.loadListFromConf(INCLUDE_PARAM_NAME,
stormConf);
excludedTags = new HashSet<String>();
ConfUtils.loadListFromConf(EXCLUDE_PARAM_NAME, stormConf)
- .forEach((s) -> excludedTags.add(s.toLowerCase()));
+ .forEach((s) -> excludedTags.add(s.toLowerCase(Locale.ROOT)));
}
public String text(Element element) {
diff --git
a/core/src/main/java/org/apache/stormcrawler/parse/filter/MimeTypeNormalization.java
b/core/src/main/java/org/apache/stormcrawler/parse/filter/MimeTypeNormalization.java
index 6784398e..3f83da2f 100644
---
a/core/src/main/java/org/apache/stormcrawler/parse/filter/MimeTypeNormalization.java
+++
b/core/src/main/java/org/apache/stormcrawler/parse/filter/MimeTypeNormalization.java
@@ -16,6 +16,7 @@
*/
package org.apache.stormcrawler.parse.filter;
+import java.util.Locale;
import org.apache.storm.shade.org.apache.commons.lang.StringUtils;
import org.apache.stormcrawler.Metadata;
import org.apache.stormcrawler.parse.ParseFilter;
@@ -36,21 +37,21 @@ public class MimeTypeNormalization extends ParseFilter {
String ct = m.getFirstValue("parse.Content-Type");
if (StringUtils.isBlank(ct)) {
ct = "unknown";
- } else if (ct.toLowerCase().contains("html")) {
+ } else if (ct.toLowerCase(Locale.ROOT).contains("html")) {
ct = "html";
- } else if (ct.toLowerCase().contains("pdf")) {
+ } else if (ct.toLowerCase(Locale.ROOT).contains("pdf")) {
ct = "pdf";
- } else if (ct.toLowerCase().contains("word")) {
+ } else if (ct.toLowerCase(Locale.ROOT).contains("word")) {
ct = "word";
- } else if (ct.toLowerCase().contains("excel")) {
+ } else if (ct.toLowerCase(Locale.ROOT).contains("excel")) {
ct = "excel";
- } else if (ct.toLowerCase().contains("powerpoint")) {
+ } else if (ct.toLowerCase(Locale.ROOT).contains("powerpoint")) {
ct = "powerpoint";
- } else if (ct.toLowerCase().startsWith("video/")) {
+ } else if (ct.toLowerCase(Locale.ROOT).startsWith("video/")) {
ct = "video";
- } else if (ct.toLowerCase().startsWith("image/")) {
+ } else if (ct.toLowerCase(Locale.ROOT).startsWith("image/")) {
ct = "image";
- } else if (ct.toLowerCase().startsWith("audio/")) {
+ } else if (ct.toLowerCase(Locale.ROOT).startsWith("audio/")) {
ct = "audio";
} else {
ct = "other";
diff --git
a/core/src/main/java/org/apache/stormcrawler/persistence/AdaptiveScheduler.java
b/core/src/main/java/org/apache/stormcrawler/persistence/AdaptiveScheduler.java
index 61c2330d..32727b04 100644
---
a/core/src/main/java/org/apache/stormcrawler/persistence/AdaptiveScheduler.java
+++
b/core/src/main/java/org/apache/stormcrawler/persistence/AdaptiveScheduler.java
@@ -22,6 +22,7 @@ import java.util.Date;
import java.util.Locale;
import java.util.Map;
import java.util.Optional;
+import java.util.TimeZone;
import org.apache.stormcrawler.Constants;
import org.apache.stormcrawler.Metadata;
import org.apache.stormcrawler.parse.filter.MD5SignatureParseFilter;
@@ -224,7 +225,7 @@ public class AdaptiveScheduler extends DefaultScheduler {
return super.schedule(status, metadata);
}
- Calendar now = Calendar.getInstance(Locale.ROOT);
+ Calendar now = Calendar.getInstance(TimeZone.getTimeZone("GMT"), Locale.ROOT);
String signatureModified =
metadata.getFirstValue(SIGNATURE_MODIFIED_KEY);
diff --git
a/core/src/main/java/org/apache/stormcrawler/persistence/DefaultScheduler.java
b/core/src/main/java/org/apache/stormcrawler/persistence/DefaultScheduler.java
index f2e35bff..925ea716 100644
---
a/core/src/main/java/org/apache/stormcrawler/persistence/DefaultScheduler.java
+++
b/core/src/main/java/org/apache/stormcrawler/persistence/DefaultScheduler.java
@@ -140,7 +140,7 @@ public class DefaultScheduler extends Scheduler {
return Optional.empty();
}
- Calendar cal = Calendar.getInstance();
+ Calendar cal = Calendar.getInstance(TimeZone.getTimeZone("GMT"), Locale.ROOT);
cal.add(Calendar.MINUTE, minutesIncrement);
return Optional.of(cal.getTime());
diff --git
a/core/src/main/java/org/apache/stormcrawler/protocol/RobotRulesParser.java
b/core/src/main/java/org/apache/stormcrawler/protocol/RobotRulesParser.java
index 02692055..ee44f8aa 100644
--- a/core/src/main/java/org/apache/stormcrawler/protocol/RobotRulesParser.java
+++ b/core/src/main/java/org/apache/stormcrawler/protocol/RobotRulesParser.java
@@ -27,6 +27,7 @@ import java.util.ArrayList;
import java.util.Collection;
import java.util.LinkedHashSet;
import java.util.List;
+import java.util.Locale;
import java.util.StringTokenizer;
import java.util.regex.Pattern;
import javax.security.auth.login.Configuration;
@@ -104,7 +105,7 @@ public abstract class RobotRulesParser {
throw new RuntimeException("Agent name not configured!");
}
- agentName = agentName.toLowerCase();
+ agentName = agentName.toLowerCase(Locale.ROOT);
checkAgentValue(agentName);
ArrayList<String> agents = new ArrayList<>();
@@ -115,13 +116,13 @@ public abstract class RobotRulesParser {
if (configuredAgentNames.size() == 1) {
StringTokenizer tok = new
StringTokenizer(configuredAgentNames.get(0), ",");
while (tok.hasMoreTokens()) {
- String agent = tok.nextToken().trim().toLowerCase();
+ String agent = tok.nextToken().trim().toLowerCase(Locale.ROOT);
checkAgentValue(agent);
agents.add(agent);
}
} else {
for (String ag : configuredAgentNames) {
- String agent = ag.trim().toLowerCase();
+ String agent = ag.trim().toLowerCase(Locale.ROOT);
checkAgentValue(agent);
agents.add(agent);
}
@@ -136,7 +137,7 @@ public abstract class RobotRulesParser {
LOG.info(
"No agents listed in 'http.robots.agents' property! Using
http.agent.name [{}]",
agentName);
- this.agentNames.add(agentName.toLowerCase());
+ this.agentNames.add(agentName.toLowerCase(Locale.ROOT));
} else {
int index = 0;
if ((agents.get(0)).equalsIgnoreCase(agentName)) {
diff --git
a/core/src/main/java/org/apache/stormcrawler/protocol/file/FileResponse.java
b/core/src/main/java/org/apache/stormcrawler/protocol/file/FileResponse.java
index 24bdfce8..16d6f93f 100644
--- a/core/src/main/java/org/apache/stormcrawler/protocol/file/FileResponse.java
+++ b/core/src/main/java/org/apache/stormcrawler/protocol/file/FileResponse.java
@@ -22,6 +22,7 @@ import java.io.IOException;
import java.lang.invoke.MethodHandles;
import java.net.URL;
import java.net.URLDecoder;
+import java.nio.charset.StandardCharsets;
import java.text.SimpleDateFormat;
import java.util.Date;
import java.util.Locale;
@@ -139,6 +140,6 @@ public class FileResponse {
}
}
sb.append("</urlset>");
- return new String(sb).getBytes();
+ return new String(sb).getBytes(StandardCharsets.UTF_8);
}
}
diff --git
a/core/src/main/java/org/apache/stormcrawler/protocol/httpclient/HttpProtocol.java
b/core/src/main/java/org/apache/stormcrawler/protocol/httpclient/HttpProtocol.java
index 2fc36f62..6478bb3c 100644
---
a/core/src/main/java/org/apache/stormcrawler/protocol/httpclient/HttpProtocol.java
+++
b/core/src/main/java/org/apache/stormcrawler/protocol/httpclient/HttpProtocol.java
@@ -19,6 +19,7 @@ package org.apache.stormcrawler.protocol.httpclient;
import java.io.IOException;
import java.io.InputStream;
import java.net.MalformedURLException;
+import java.nio.charset.StandardCharsets;
import java.util.ArrayList;
import java.util.Base64;
import java.util.Collection;
@@ -120,7 +121,9 @@ public class HttpProtocol extends AbstractHttpProtocol
String basicAuthPass = ConfUtils.getString(conf,
"http.basicauth.password", "");
String encoding =
Base64.getEncoder()
- .encodeToString((basicAuthUser + ":" + basicAuthPass).getBytes());
+ .encodeToString(
+ (basicAuthUser + ":" + basicAuthPass)
+ .getBytes(StandardCharsets.UTF_8));
defaultHeaders.add(new BasicHeader("Authorization", "Basic " +
encoding));
}
diff --git
a/core/src/main/java/org/apache/stormcrawler/protocol/okhttp/HttpProtocol.java
b/core/src/main/java/org/apache/stormcrawler/protocol/okhttp/HttpProtocol.java
index 3a8cfffa..01902713 100644
---
a/core/src/main/java/org/apache/stormcrawler/protocol/okhttp/HttpProtocol.java
+++
b/core/src/main/java/org/apache/stormcrawler/protocol/okhttp/HttpProtocol.java
@@ -21,6 +21,7 @@ import java.net.InetSocketAddress;
import java.net.MalformedURLException;
import java.net.Proxy;
import java.net.URL;
+import java.nio.charset.StandardCharsets;
import java.security.cert.CertificateException;
import java.util.*;
import java.util.concurrent.TimeUnit;
@@ -200,7 +201,9 @@ public class HttpProtocol extends AbstractHttpProtocol {
final String basicAuthPass = ConfUtils.getString(conf,
"http.basicauth.password", "");
final String encoding =
Base64.getEncoder()
- .encodeToString((basicAuthUser + ":" + basicAuthPass).getBytes());
+ .encodeToString(
+ (basicAuthUser + ":" + basicAuthPass)
+ .getBytes(StandardCharsets.UTF_8));
customRequestHeaders.add(new KeyValue("Authorization", "Basic " +
encoding));
}
@@ -289,7 +292,7 @@ public class HttpProtocol extends AbstractHttpProtocol {
// format SCProxy into native Java proxy
Proxy proxy =
new Proxy(
- Proxy.Type.valueOf(prox.getProtocol().toUpperCase()),
+ Proxy.Type.valueOf(prox.getProtocol().toUpperCase(Locale.ROOT)),
new InetSocketAddress(
prox.getAddress(),
Integer.parseInt(prox.getPort())));
@@ -395,7 +398,9 @@ public class HttpProtocol extends AbstractHttpProtocol {
if (key.equals(ProtocolResponse.REQUEST_HEADERS_KEY)
|| key.equals(ProtocolResponse.RESPONSE_HEADERS_KEY)) {
- value = new String(Base64.getDecoder().decode(value));
+ value =
+ new String(
+ Base64.getDecoder().decode(value), StandardCharsets.ISO_8859_1);
}
responsemetadata.addValue(key.toLowerCase(Locale.ROOT), value);
@@ -569,10 +574,18 @@ public class HttpProtocol extends AbstractHttpProtocol {
responseverbatim.append("\r\n");
final byte[] encodedBytesResponse =
- Base64.getEncoder().encode(responseverbatim.toString().getBytes());
+ Base64.getEncoder()
+ .encode(
+ responseverbatim
+ .toString()
+ .getBytes(StandardCharsets.ISO_8859_1));
final byte[] encodedBytesRequest =
- Base64.getEncoder().encode(requestverbatim.toString().getBytes());
+ Base64.getEncoder()
+ .encode(
+ requestverbatim
+ .toString()
+ .getBytes(StandardCharsets.ISO_8859_1));
final StringBuilder protocols = new
StringBuilder(response.protocol().toString());
final Handshake handshake = connection.handshake();
@@ -583,8 +596,12 @@ public class HttpProtocol extends AbstractHttpProtocol {
// returns a modified version of the response
return response.newBuilder()
- .header(ProtocolResponse.REQUEST_HEADERS_KEY, new String(encodedBytesRequest))
- .header(ProtocolResponse.RESPONSE_HEADERS_KEY, new String(encodedBytesResponse))
+ .header(
+ ProtocolResponse.REQUEST_HEADERS_KEY,
+ new String(encodedBytesRequest, StandardCharsets.ISO_8859_1))
+ .header(
+ ProtocolResponse.RESPONSE_HEADERS_KEY,
+ new String(encodedBytesResponse, StandardCharsets.ISO_8859_1))
.header(ProtocolResponse.RESPONSE_IP_KEY, ipAddress)
.header(ProtocolResponse.REQUEST_TIME_KEY,
Long.toString(startFetchTime))
.header(ProtocolResponse.PROTOCOL_VERSIONS_KEY,
protocols.toString())
diff --git
a/core/src/main/java/org/apache/stormcrawler/protocol/selenium/SeleniumProtocol.java
b/core/src/main/java/org/apache/stormcrawler/protocol/selenium/SeleniumProtocol.java
index 21f2d175..23a318b8 100644
---
a/core/src/main/java/org/apache/stormcrawler/protocol/selenium/SeleniumProtocol.java
+++
b/core/src/main/java/org/apache/stormcrawler/protocol/selenium/SeleniumProtocol.java
@@ -16,6 +16,7 @@
*/
package org.apache.stormcrawler.protocol.selenium;
+import java.nio.charset.StandardCharsets;
import java.time.Instant;
import java.util.concurrent.LinkedBlockingQueue;
import org.apache.storm.Config;
@@ -77,7 +78,7 @@ public abstract class SeleniumProtocol extends
AbstractHttpProtocol {
outputMeta.addValue(MD_KEY_END, Instant.now().toString());
// if no filters got triggered
- byte[] content = driver.getPageSource().getBytes();
+ byte[] content = driver.getPageSource().getBytes(StandardCharsets.UTF_8);
return new ProtocolResponse(content, 200, outputMeta);
} finally {
diff --git
a/core/src/main/java/org/apache/stormcrawler/proxy/MultiProxyManager.java
b/core/src/main/java/org/apache/stormcrawler/proxy/MultiProxyManager.java
index fe5e53af..016f06bc 100644
--- a/core/src/main/java/org/apache/stormcrawler/proxy/MultiProxyManager.java
+++ b/core/src/main/java/org/apache/stormcrawler/proxy/MultiProxyManager.java
@@ -17,9 +17,9 @@
package org.apache.stormcrawler.proxy;
import java.io.File;
-import java.io.FileNotFoundException;
import java.io.IOException;
import java.net.URL;
+import java.nio.charset.StandardCharsets;
import java.util.Random;
import java.util.Scanner;
import java.util.concurrent.atomic.AtomicInteger;
@@ -95,7 +95,7 @@ public class MultiProxyManager implements ProxyManager {
// conditionally load file from resources
if (resourcesProxyFilePath != null) {
try {
- scanner = new Scanner(resourcesProxyFilePath.openStream());
+ scanner = new Scanner(resourcesProxyFilePath.openStream(), StandardCharsets.UTF_8);
} catch (IOException e) {
throw new RuntimeException("failed to load proxy resource
file: " + proxyFile, e);
}
@@ -105,8 +105,8 @@ public class MultiProxyManager implements ProxyManager {
// create new scanner to read file line-by-line
try {
- scanner = new Scanner(proxyFileObj);
- } catch (FileNotFoundException e) {
+ scanner = new Scanner(proxyFileObj, StandardCharsets.UTF_8);
+ } catch (IOException e) {
throw new RuntimeException("failed to load proxy file: " +
proxyFile, e);
}
}
diff --git
a/core/src/main/java/org/apache/stormcrawler/proxy/SingleProxyManager.java
b/core/src/main/java/org/apache/stormcrawler/proxy/SingleProxyManager.java
index c986f304..ff6c8b63 100644
--- a/core/src/main/java/org/apache/stormcrawler/proxy/SingleProxyManager.java
+++ b/core/src/main/java/org/apache/stormcrawler/proxy/SingleProxyManager.java
@@ -16,6 +16,7 @@
*/
package org.apache.stormcrawler.proxy;
+import java.util.Locale;
import org.apache.storm.Config;
import org.apache.stormcrawler.Metadata;
import org.apache.stormcrawler.util.ConfUtils;
@@ -42,7 +43,7 @@ public class SingleProxyManager implements ProxyManager {
String proxyPassword = ConfUtils.getString(conf, "http.proxy.pass",
null);
// assemble proxy connection string
- String proxyString = proxyType.toLowerCase() + "://";
+ String proxyString = proxyType.toLowerCase(Locale.ROOT) + "://";
// conditionally append authentication info
if (proxyUsername != null
@@ -53,7 +54,9 @@ public class SingleProxyManager implements ProxyManager {
}
// complete proxy string and create proxy
- this.proxy = new SCProxy(proxyString + String.format("%s:%d", proxyHost, proxyPort));
+ this.proxy =
+ new SCProxy(
+ proxyString + String.format(Locale.ROOT, "%s:%d", proxyHost, proxyPort));
}
@Override
diff --git a/core/src/main/java/org/apache/stormcrawler/spout/FileSpout.java
b/core/src/main/java/org/apache/stormcrawler/spout/FileSpout.java
index 6d5d6f64..773b90a2 100644
--- a/core/src/main/java/org/apache/stormcrawler/spout/FileSpout.java
+++ b/core/src/main/java/org/apache/stormcrawler/spout/FileSpout.java
@@ -233,7 +233,7 @@ public class FileSpout extends BaseRichSpout {
@Override
public void fail(Object msgId) {
if (msgId instanceof byte[]) {
- String msg = new String((byte[]) msgId);
+ String msg = new String((byte[]) msgId, StandardCharsets.UTF_8);
LOG.error("Failed - adding back to the queue: {}", msg);
buffer.add((byte[]) msgId);
} else {
diff --git
a/core/src/main/java/org/apache/stormcrawler/util/CookieConverter.java
b/core/src/main/java/org/apache/stormcrawler/util/CookieConverter.java
index 85691eed..0dc055ec 100644
--- a/core/src/main/java/org/apache/stormcrawler/util/CookieConverter.java
+++ b/core/src/main/java/org/apache/stormcrawler/util/CookieConverter.java
@@ -62,13 +62,13 @@ public class CookieConverter {
for (int i = 1; i < tokens.length; i++) {
String ti = tokens[i].trim();
if (ti.equalsIgnoreCase("secure")) secure = true;
- if (ti.toLowerCase().startsWith("path=")) {
+ if (ti.toLowerCase(Locale.ROOT).startsWith("path=")) {
path = ti.substring(5);
}
- if (ti.toLowerCase().startsWith("domain=")) {
+ if (ti.toLowerCase(Locale.ROOT).startsWith("domain=")) {
domain = ti.substring(7);
}
- if (ti.toLowerCase().startsWith("expires=")) {
+ if (ti.toLowerCase(Locale.ROOT).startsWith("expires=")) {
expires = ti.substring(8);
}
}
diff --git
a/core/src/main/java/org/apache/stormcrawler/util/PerSecondReducer.java
b/core/src/main/java/org/apache/stormcrawler/util/PerSecondReducer.java
index 3dbc206f..706ef71f 100644
--- a/core/src/main/java/org/apache/stormcrawler/util/PerSecondReducer.java
+++ b/core/src/main/java/org/apache/stormcrawler/util/PerSecondReducer.java
@@ -49,7 +49,7 @@ public class PerSecondReducer implements
IReducer<TimeReducerState> {
double msec = System.currentTimeMillis() - accumulator.started;
if (msec == 0) return 0;
double permsec = accumulator.sum / msec;
- return new Double(permsec * 1000d);
+ return Double.valueOf(permsec * 1000d);
}
}
diff --git
a/core/src/test/java/org/apache/stormcrawler/bolt/JSoupParserBoltTest.java
b/core/src/test/java/org/apache/stormcrawler/bolt/JSoupParserBoltTest.java
index d1f35cc1..00b0f127 100644
--- a/core/src/test/java/org/apache/stormcrawler/bolt/JSoupParserBoltTest.java
+++ b/core/src/test/java/org/apache/stormcrawler/bolt/JSoupParserBoltTest.java
@@ -17,6 +17,7 @@
package org.apache.stormcrawler.bolt;
import java.io.IOException;
+import java.nio.charset.StandardCharsets;
import java.util.HashMap;
import java.util.List;
import java.util.Map;
@@ -190,7 +191,7 @@ public class JSoupParserBoltTest extends ParsingTester {
for (int i = 0; i < tests.length; i++) {
- byte[] bytes = tests[i].getBytes();
+ byte[] bytes = tests[i].getBytes(StandardCharsets.UTF_8);
parse("http://www.digitalpebble.com", bytes, new Metadata());
diff --git
a/core/src/test/java/org/apache/stormcrawler/filtering/BasicURLNormalizerTest.java
b/core/src/test/java/org/apache/stormcrawler/filtering/BasicURLNormalizerTest.java
index 940efbd2..e7620fc7 100644
---
a/core/src/test/java/org/apache/stormcrawler/filtering/BasicURLNormalizerTest.java
+++
b/core/src/test/java/org/apache/stormcrawler/filtering/BasicURLNormalizerTest.java
@@ -27,6 +27,7 @@ import java.net.URL;
import java.util.Arrays;
import java.util.HashMap;
import java.util.List;
+import java.util.Locale;
import java.util.Map;
import org.apache.stormcrawler.Metadata;
import org.apache.stormcrawler.filtering.basic.BasicURLNormalizer;
@@ -286,13 +287,13 @@ public class BasicURLNormalizerTest {
URL testSourceUrl = new URL("http://blablabla.org/");
String inputURL = "HTTP://www.quanjing.com/";
- String expectedResult = inputURL.toLowerCase();
+ String expectedResult = inputURL.toLowerCase(Locale.ROOT);
String normalizedUrl = urlFilter.filter(testSourceUrl, new Metadata(),
inputURL);
assertEquals("Failed to filter query string", expectedResult,
normalizedUrl);
inputURL = "http://www.QUANJING.COM/";
- expectedResult = inputURL.toLowerCase();
+ expectedResult = inputURL.toLowerCase(Locale.ROOT);
normalizedUrl = urlFilter.filter(testSourceUrl, new Metadata(),
inputURL);
assertEquals("Failed to filter query string", expectedResult,
normalizedUrl);
diff --git
a/core/src/test/java/org/apache/stormcrawler/indexer/IndexerTester.java
b/core/src/test/java/org/apache/stormcrawler/indexer/IndexerTester.java
index 7dffe7ba..ddd193bb 100644
--- a/core/src/test/java/org/apache/stormcrawler/indexer/IndexerTester.java
+++ b/core/src/test/java/org/apache/stormcrawler/indexer/IndexerTester.java
@@ -20,6 +20,7 @@ import static org.mockito.Mockito.mock;
import static org.mockito.Mockito.when;
import java.io.IOException;
+import java.nio.charset.StandardCharsets;
import java.util.Map;
import org.apache.storm.task.OutputCollector;
import org.apache.storm.tuple.Tuple;
@@ -59,7 +60,8 @@ public class IndexerTester {
protected void index(String url, String content, Metadata metadata) throws
IOException {
Tuple tuple = mock(Tuple.class);
- when(tuple.getBinaryByField("content")).thenReturn(content.getBytes());
+ when(tuple.getBinaryByField("content"))
+ .thenReturn(content.getBytes(StandardCharsets.UTF_8));
when(tuple.getStringByField("url")).thenReturn(url);
when(tuple.getValueByField("metadata")).thenReturn(metadata);
bolt.execute(tuple);
diff --git
a/core/src/test/java/org/apache/stormcrawler/persistence/AdaptiveSchedulerTest.java
b/core/src/test/java/org/apache/stormcrawler/persistence/AdaptiveSchedulerTest.java
index 4f51f696..1e7c6e78 100644
---
a/core/src/test/java/org/apache/stormcrawler/persistence/AdaptiveSchedulerTest.java
+++
b/core/src/test/java/org/apache/stormcrawler/persistence/AdaptiveSchedulerTest.java
@@ -25,8 +25,10 @@ import java.util.Calendar;
import java.util.Date;
import java.util.GregorianCalendar;
import java.util.HashMap;
+import java.util.Locale;
import java.util.Map;
import java.util.Optional;
+import java.util.TimeZone;
import org.apache.commons.lang.time.DateUtils;
import org.apache.stormcrawler.Metadata;
import org.apache.stormcrawler.protocol.HttpHeaders;
@@ -64,7 +66,7 @@ public class AdaptiveSchedulerTest {
metadata.addValue("fetch.statusCode", "200");
Optional<Date> nextFetch = scheduler.schedule(Status.FETCHED,
metadata);
- Calendar cal = Calendar.getInstance();
+ Calendar cal = Calendar.getInstance(TimeZone.getTimeZone("GMT"),
Locale.ROOT);
cal.add(Calendar.MINUTE, 6);
Assert.assertEquals(
DateUtils.round(cal.getTime(), Calendar.SECOND),
@@ -72,7 +74,7 @@ public class AdaptiveSchedulerTest {
nextFetch = scheduler.schedule(Status.ERROR, metadata);
- cal = Calendar.getInstance();
+ cal = Calendar.getInstance(TimeZone.getTimeZone("GMT"), Locale.ROOT);
cal.add(Calendar.MINUTE, 8);
Assert.assertEquals(
DateUtils.round(cal.getTime(), Calendar.SECOND),
@@ -89,7 +91,11 @@ public class AdaptiveSchedulerTest {
metadata.addValue(AdaptiveScheduler.SIGNATURE_KEY, md5sumEmptyContent);
scheduler.schedule(Status.FETCHED, metadata);
Instant firstFetch =
- DateUtils.round(Calendar.getInstance().getTime(),
Calendar.SECOND).toInstant();
+ DateUtils.round(
+
Calendar.getInstance(TimeZone.getTimeZone("GMT"), Locale.ROOT)
+ .getTime(),
+ Calendar.SECOND)
+ .toInstant();
/* verify initial fetch interval and last-modified time */
String lastModified =
metadata.getFirstValue(HttpHeaders.LAST_MODIFIED);
@@ -135,7 +141,11 @@ public class AdaptiveSchedulerTest {
metadata.addValue(AdaptiveScheduler.SIGNATURE_KEY, md5sumSpaceContent);
scheduler.schedule(Status.FETCHED, metadata);
Instant lastFetch =
- DateUtils.round(Calendar.getInstance().getTime(),
Calendar.SECOND).toInstant();
+ DateUtils.round(
+
Calendar.getInstance(TimeZone.getTimeZone("GMT"), Locale.ROOT)
+ .getTime(),
+ Calendar.SECOND)
+ .toInstant();
fetchInterval =
metadata.getFirstValue(AdaptiveScheduler.FETCH_INTERVAL_KEY);
Assert.assertNotNull(fetchInterval);
/* interval should now shrink */
diff --git
a/core/src/test/java/org/apache/stormcrawler/persistence/DefaultSchedulerTest.java
b/core/src/test/java/org/apache/stormcrawler/persistence/DefaultSchedulerTest.java
index 451a606a..192ddfc4 100644
---
a/core/src/test/java/org/apache/stormcrawler/persistence/DefaultSchedulerTest.java
+++
b/core/src/test/java/org/apache/stormcrawler/persistence/DefaultSchedulerTest.java
@@ -20,8 +20,10 @@ import java.net.MalformedURLException;
import java.util.Calendar;
import java.util.Date;
import java.util.HashMap;
+import java.util.Locale;
import java.util.Map;
import java.util.Optional;
+import java.util.TimeZone;
import org.apache.commons.lang.time.DateUtils;
import org.apache.stormcrawler.Metadata;
import org.junit.Assert;
@@ -40,7 +42,7 @@ public class DefaultSchedulerTest {
metadata.addValue("testKey", "someValue");
Optional<Date> nextFetch = scheduler.schedule(Status.FETCHED,
metadata);
- Calendar cal = Calendar.getInstance();
+ Calendar cal = Calendar.getInstance(TimeZone.getTimeZone("GMT"),
Locale.ROOT);
cal.add(Calendar.MINUTE, 360);
Assert.assertEquals(
DateUtils.round(cal.getTime(), Calendar.SECOND),
@@ -48,7 +50,7 @@ public class DefaultSchedulerTest {
nextFetch = scheduler.schedule(Status.ERROR, metadata);
- cal = Calendar.getInstance();
+ cal = Calendar.getInstance(TimeZone.getTimeZone("GMT"), Locale.ROOT);
cal.add(Calendar.MINUTE, 3600);
Assert.assertEquals(
DateUtils.round(cal.getTime(), Calendar.SECOND),
@@ -66,7 +68,7 @@ public class DefaultSchedulerTest {
metadata.addValue("testKey.key2", "someValue");
Optional<Date> nextFetch = scheduler.schedule(Status.FETCHED,
metadata);
- Calendar cal = Calendar.getInstance();
+ Calendar cal = Calendar.getInstance(TimeZone.getTimeZone("GMT"),
Locale.ROOT);
cal.add(Calendar.MINUTE, 360);
Assert.assertEquals(
DateUtils.round(cal.getTime(), Calendar.SECOND),
diff --git
a/core/src/test/java/org/apache/stormcrawler/proxy/MultiProxyManagerTest.java
b/core/src/test/java/org/apache/stormcrawler/proxy/MultiProxyManagerTest.java
index e587652b..5aa01dab 100644
---
a/core/src/test/java/org/apache/stormcrawler/proxy/MultiProxyManagerTest.java
+++
b/core/src/test/java/org/apache/stormcrawler/proxy/MultiProxyManagerTest.java
@@ -18,6 +18,7 @@ package org.apache.stormcrawler.proxy;
import java.io.FileWriter;
import java.io.IOException;
+import java.nio.charset.StandardCharsets;
import java.nio.file.Files;
import java.nio.file.Paths;
import org.apache.storm.Config;
@@ -53,7 +54,7 @@ public class MultiProxyManagerTest {
"sock5://example.com:64000",
};
- FileWriter writer = new FileWriter("/tmp/proxies.txt");
+ FileWriter writer = new FileWriter("/tmp/proxies.txt",
StandardCharsets.UTF_8);
for (String proxyString : proxyStrings) {
writer.write("# fake comment to test" + "\n");
writer.write("// fake comment to test" + "\n");
diff --git
a/external/aws/src/main/java/org/apache/stormcrawler/aws/bolt/CloudSearchIndexerBolt.java
b/external/aws/src/main/java/org/apache/stormcrawler/aws/bolt/CloudSearchIndexerBolt.java
index 2d8027f5..03618df7 100644
---
a/external/aws/src/main/java/org/apache/stormcrawler/aws/bolt/CloudSearchIndexerBolt.java
+++
b/external/aws/src/main/java/org/apache/stormcrawler/aws/bolt/CloudSearchIndexerBolt.java
@@ -46,6 +46,7 @@ import java.util.ArrayList;
import java.util.HashMap;
import java.util.Iterator;
import java.util.List;
+import java.util.Locale;
import java.util.Map;
import java.util.Map.Entry;
import org.apache.commons.io.FileUtils;
@@ -73,7 +74,7 @@ public class CloudSearchIndexerBolt extends
AbstractIndexerBolt {
private static final int MAX_SIZE_DOC_BYTES = 1048576;
private static final SimpleDateFormat DATE_FORMAT =
- new SimpleDateFormat("yyyy-MM-dd'T'HH:mm:ss.SSS'Z'");
+ new SimpleDateFormat("yyyy-MM-dd'T'HH:mm:ss.SSS'Z'", Locale.ROOT);
private AmazonCloudSearchDomainClient client;
diff --git
a/external/aws/src/main/java/org/apache/stormcrawler/aws/bolt/CloudSearchUtils.java
b/external/aws/src/main/java/org/apache/stormcrawler/aws/bolt/CloudSearchUtils.java
index cb912a63..e35916cc 100644
---
a/external/aws/src/main/java/org/apache/stormcrawler/aws/bolt/CloudSearchUtils.java
+++
b/external/aws/src/main/java/org/apache/stormcrawler/aws/bolt/CloudSearchUtils.java
@@ -19,6 +19,7 @@ package org.apache.stormcrawler.aws.bolt;
import java.nio.charset.StandardCharsets;
import java.security.MessageDigest;
import java.security.NoSuchAlgorithmException;
+import java.util.Locale;
import java.util.regex.Pattern;
import org.apache.commons.codec.binary.Hex;
@@ -73,7 +74,7 @@ public class CloudSearchUtils {
* @return
*/
public static String cleanFieldName(String name) {
- String lowercase = name.toLowerCase();
+ String lowercase = name.toLowerCase(Locale.ROOT);
lowercase = lowercase.replaceAll("[^a-z_0-9]", "_");
if (lowercase.length() < 3 || lowercase.length() > 64)
throw new RuntimeException("Field name must be between 3 and 64
chars : " + lowercase);
diff --git
a/external/opensearch/src/main/java/org/apache/stormcrawler/opensearch/metrics/MetricsConsumer.java
b/external/opensearch/src/main/java/org/apache/stormcrawler/opensearch/metrics/MetricsConsumer.java
index 3014f2c6..3a336783 100644
---
a/external/opensearch/src/main/java/org/apache/stormcrawler/opensearch/metrics/MetricsConsumer.java
+++
b/external/opensearch/src/main/java/org/apache/stormcrawler/opensearch/metrics/MetricsConsumer.java
@@ -23,6 +23,7 @@ import java.text.SimpleDateFormat;
import java.util.Collection;
import java.util.Date;
import java.util.Iterator;
+import java.util.Locale;
import java.util.Map;
import java.util.Map.Entry;
import org.apache.storm.metric.api.IMetricsConsumer;
@@ -75,7 +76,7 @@ public class MetricsConsumer implements IMetricsConsumer {
indexName = ConfUtils.getString(stormConf,
OSMetricsIndexNameParamName, "metrics");
stormID = context.getStormId();
if (registrationArgument != null) {
- dateFormat = new SimpleDateFormat((String) registrationArgument);
+ dateFormat = new SimpleDateFormat((String) registrationArgument,
Locale.ROOT);
LOG.info("Using date format {}", registrationArgument);
}
try {
diff --git
a/external/opensearch/src/main/java/org/apache/stormcrawler/opensearch/persistence/AggregationSpout.java
b/external/opensearch/src/main/java/org/apache/stormcrawler/opensearch/persistence/AggregationSpout.java
index 696c2808..8d786260 100644
---
a/external/opensearch/src/main/java/org/apache/stormcrawler/opensearch/persistence/AggregationSpout.java
+++
b/external/opensearch/src/main/java/org/apache/stormcrawler/opensearch/persistence/AggregationSpout.java
@@ -23,8 +23,10 @@ import java.util.Calendar;
import java.util.Date;
import java.util.HashSet;
import java.util.Iterator;
+import java.util.Locale;
import java.util.Map;
import java.util.Set;
+import java.util.TimeZone;
import java.util.concurrent.TimeUnit;
import org.apache.commons.lang.StringUtils;
import org.apache.storm.spout.SpoutOutputCollector;
@@ -305,16 +307,17 @@ public class AggregationSpout extends AbstractSpout
implements ActionListener<Se
// returned in the query and add to it, unless the previous value is
// within n mins in which case we'll keep it
if (mostRecentDateFound != null && recentDateIncrease >= 0) {
- Calendar potentialNewDate = Calendar.getInstance();
+ Calendar potentialNewDate =
+ Calendar.getInstance(TimeZone.getTimeZone("GMT"),
Locale.ROOT);
potentialNewDate.setTimeInMillis(mostRecentDateFound.toEpochMilli());
potentialNewDate.add(Calendar.MINUTE, recentDateIncrease);
Date oldDate = null;
// check boundaries
if (this.recentDateMinGap > 0) {
- Calendar low = Calendar.getInstance();
+ Calendar low =
Calendar.getInstance(TimeZone.getTimeZone("GMT"), Locale.ROOT);
low.setTime(queryDate);
low.add(Calendar.MINUTE, -recentDateMinGap);
- Calendar high = Calendar.getInstance();
+ Calendar high =
Calendar.getInstance(TimeZone.getTimeZone("GMT"), Locale.ROOT);
high.setTime(queryDate);
high.add(Calendar.MINUTE, recentDateMinGap);
if (high.before(potentialNewDate) ||
low.after(potentialNewDate)) {
diff --git
a/external/opensearch/src/main/java/org/apache/stormcrawler/opensearch/persistence/StatusUpdaterBolt.java
b/external/opensearch/src/main/java/org/apache/stormcrawler/opensearch/persistence/StatusUpdaterBolt.java
index 7e52a8f0..1fdfd491 100644
---
a/external/opensearch/src/main/java/org/apache/stormcrawler/opensearch/persistence/StatusUpdaterBolt.java
+++
b/external/opensearch/src/main/java/org/apache/stormcrawler/opensearch/persistence/StatusUpdaterBolt.java
@@ -26,6 +26,7 @@ import java.util.Date;
import java.util.HashSet;
import java.util.LinkedList;
import java.util.List;
+import java.util.Locale;
import java.util.Map;
import java.util.Objects;
import java.util.Optional;
@@ -125,13 +126,19 @@ public class StatusUpdaterBolt extends
AbstractStatusUpdaterBolt
indexName =
ConfUtils.getString(
stormConf,
-
String.format(StatusUpdaterBolt.OSStatusIndexNameParamName, OSBoltType),
+ String.format(
+ Locale.ROOT,
+ StatusUpdaterBolt.OSStatusIndexNameParamName,
+ OSBoltType),
"status");
doRouting =
ConfUtils.getBoolean(
stormConf,
-
String.format(StatusUpdaterBolt.OSStatusRoutingParamName, OSBoltType),
+ String.format(
+ Locale.ROOT,
+ StatusUpdaterBolt.OSStatusRoutingParamName,
+ OSBoltType),
false);
partitioner = new URLPartitioner();
@@ -140,7 +147,10 @@ public class StatusUpdaterBolt extends
AbstractStatusUpdaterBolt
fieldNameForRoutingKey =
ConfUtils.getString(
stormConf,
-
String.format(StatusUpdaterBolt.OSStatusRoutingFieldParamName, OSBoltType));
+ String.format(
+ Locale.ROOT,
+
StatusUpdaterBolt.OSStatusRoutingFieldParamName,
+ OSBoltType));
if (StringUtils.isNotBlank(fieldNameForRoutingKey)) {
if (fieldNameForRoutingKey.startsWith("metadata.")) {
routingFieldNameInMetadata = true;
diff --git
a/external/solr/src/main/java/org/apache/stormcrawler/solr/metrics/MetricsConsumer.java
b/external/solr/src/main/java/org/apache/stormcrawler/solr/metrics/MetricsConsumer.java
index 982f562c..637f7d24 100644
---
a/external/solr/src/main/java/org/apache/stormcrawler/solr/metrics/MetricsConsumer.java
+++
b/external/solr/src/main/java/org/apache/stormcrawler/solr/metrics/MetricsConsumer.java
@@ -21,6 +21,7 @@ import java.text.SimpleDateFormat;
import java.util.Collection;
import java.util.Date;
import java.util.Iterator;
+import java.util.Locale;
import java.util.Map;
import java.util.Map.Entry;
import org.apache.solr.common.SolrInputDocument;
@@ -36,7 +37,7 @@ public class MetricsConsumer implements IMetricsConsumer {
private final Logger LOG = LoggerFactory.getLogger(MetricsConsumer.class);
- private final DateFormat df = new
SimpleDateFormat("yyyy-MM-dd'T'HH:mm:ss'Z'");
+ private final DateFormat df = new
SimpleDateFormat("yyyy-MM-dd'T'HH:mm:ss'Z'", Locale.ROOT);
private static final String BOLT_TYPE = "metrics";
diff --git
a/external/solr/src/main/java/org/apache/stormcrawler/solr/persistence/StatusUpdaterBolt.java
b/external/solr/src/main/java/org/apache/stormcrawler/solr/persistence/StatusUpdaterBolt.java
index 3f872870..562d03e0 100644
---
a/external/solr/src/main/java/org/apache/stormcrawler/solr/persistence/StatusUpdaterBolt.java
+++
b/external/solr/src/main/java/org/apache/stormcrawler/solr/persistence/StatusUpdaterBolt.java
@@ -18,6 +18,7 @@ package org.apache.stormcrawler.solr.persistence;
import java.util.Date;
import java.util.Iterator;
+import java.util.Locale;
import java.util.Map;
import java.util.Optional;
import org.apache.solr.common.SolrInputDocument;
@@ -83,7 +84,7 @@ public class StatusUpdaterBolt extends
AbstractStatusUpdaterBolt {
while (keyIterator.hasNext()) {
String key = keyIterator.next();
String[] values = metadata.getValues(key);
- doc.setField(String.format("%s.%s", mdPrefix, key), values);
+ doc.setField(String.format(Locale.ROOT, "%s.%s", mdPrefix, key),
values);
}
if (nextFetch.isPresent()) {
diff --git
a/external/sql/src/main/java/org/apache/stormcrawler/sql/SQLSpout.java
b/external/sql/src/main/java/org/apache/stormcrawler/sql/SQLSpout.java
index 8207130e..facb1409 100644
--- a/external/sql/src/main/java/org/apache/stormcrawler/sql/SQLSpout.java
+++ b/external/sql/src/main/java/org/apache/stormcrawler/sql/SQLSpout.java
@@ -17,6 +17,7 @@
package org.apache.stormcrawler.sql;
import java.nio.ByteBuffer;
+import java.nio.charset.StandardCharsets;
import java.sql.Connection;
import java.sql.ResultSet;
import java.sql.SQLException;
@@ -173,7 +174,8 @@ public class SQLSpout extends AbstractQueryingSpout {
metadata = "\t" + metadata;
}
String URLMD = url + metadata;
- List<Object> v =
SCHEME.deserialize(ByteBuffer.wrap(URLMD.getBytes()));
+ List<Object> v =
+
SCHEME.deserialize(ByteBuffer.wrap(URLMD.getBytes(StandardCharsets.UTF_8)));
buffer.add(url, (Metadata) v.get(1));
}
diff --git
a/external/warc/src/main/java/org/apache/stormcrawler/warc/WARCFileNameFormat.java
b/external/warc/src/main/java/org/apache/stormcrawler/warc/WARCFileNameFormat.java
index 097650b0..319ef1cb 100644
---
a/external/warc/src/main/java/org/apache/stormcrawler/warc/WARCFileNameFormat.java
+++
b/external/warc/src/main/java/org/apache/stormcrawler/warc/WARCFileNameFormat.java
@@ -18,6 +18,7 @@ package org.apache.stormcrawler.warc;
import java.text.SimpleDateFormat;
import java.util.Date;
+import java.util.Locale;
import java.util.Map;
import java.util.TimeZone;
import org.apache.storm.hdfs.bolt.format.FileNameFormat;
@@ -70,18 +71,18 @@ public class WARCFileNameFormat implements FileNameFormat {
@Override
public String getName(long rotation, long timeStamp) {
- SimpleDateFormat fileDate = new SimpleDateFormat("yyyyMMddHHmmss");
+ SimpleDateFormat fileDate = new SimpleDateFormat("yyyyMMddHHmmss",
Locale.ROOT);
fileDate.setTimeZone(TimeZone.getTimeZone("GMT"));
String taskindexString = "";
if (this.taskIndex != -1) {
- taskindexString = String.format("%02d", this.taskIndex) + "-";
+ taskindexString = String.format(Locale.ROOT, "%02d",
this.taskIndex) + "-";
}
return this.prefix
+ "-"
+ fileDate.format(new Date(timeStamp))
+ "-"
+ taskindexString
- + String.format("%05d", rotation)
+ + String.format(Locale.ROOT, "%05d", rotation)
+ this.extension;
}
diff --git
a/external/warc/src/main/java/org/apache/stormcrawler/warc/WARCRecordFormat.java
b/external/warc/src/main/java/org/apache/stormcrawler/warc/WARCRecordFormat.java
index 2b8bf873..aba12641 100644
---
a/external/warc/src/main/java/org/apache/stormcrawler/warc/WARCRecordFormat.java
+++
b/external/warc/src/main/java/org/apache/stormcrawler/warc/WARCRecordFormat.java
@@ -330,7 +330,7 @@ public class WARCRecordFormat implements RecordFormat {
if (StringUtils.isNotBlank(headersVerbatim)) {
WARCTypeValue = WARC_TYPE_RESPONSE;
headersVerbatim = fixHttpHeaders(headersVerbatim, content.length);
- httpheaders = headersVerbatim.getBytes();
+ httpheaders = headersVerbatim.getBytes(StandardCharsets.UTF_8);
}
StringBuilder buffer = new StringBuilder();
diff --git
a/external/warc/src/main/java/org/apache/stormcrawler/warc/WARCRequestRecordFormat.java
b/external/warc/src/main/java/org/apache/stormcrawler/warc/WARCRequestRecordFormat.java
index 63a45dbf..c91a21e1 100644
---
a/external/warc/src/main/java/org/apache/stormcrawler/warc/WARCRequestRecordFormat.java
+++
b/external/warc/src/main/java/org/apache/stormcrawler/warc/WARCRequestRecordFormat.java
@@ -58,7 +58,7 @@ public class WARCRequestRecordFormat extends WARCRecordFormat
{
LOG.warn("No request header for {}", url);
return new byte[] {};
}
- final byte[] httpheaders = fixHttpHeaders(headersVerbatim).getBytes();
+ final byte[] httpheaders =
fixHttpHeaders(headersVerbatim).getBytes(StandardCharsets.UTF_8);
StringBuilder buffer = new StringBuilder();
buffer.append(WARC_VERSION);
diff --git a/pom.xml b/pom.xml
index 375763ee..aa76643d 100644
--- a/pom.xml
+++ b/pom.xml
@@ -338,6 +338,33 @@ under the License.
</execution>
</executions>
</plugin>
+ <plugin>
+ <groupId>de.thetaphi</groupId>
+ <artifactId>forbiddenapis</artifactId>
+ <version>3.7</version>
+ <configuration>
+ <targetVersion>11</targetVersion>
+ <ignoreSignaturesOfMissingClasses>true</ignoreSignaturesOfMissingClasses>
+ <failOnUnsupportedJava>false</failOnUnsupportedJava>
+ <excludes>test-documents/*.class</excludes>
+ <bundledSignatures>
+ <bundledSignature>jdk-unsafe</bundledSignature>
+ <bundledSignature>jdk-deprecated</bundledSignature>
+ <bundledSignature>jdk-non-portable</bundledSignature>
+ <bundledSignature>jdk-internal</bundledSignature>
+ <!-- replace with ${commons.io.version} when available -->
+ <bundledSignature>commons-io-unsafe-2.11.0</bundledSignature>
+ </bundledSignatures>
+ </configuration>
+ <executions>
+ <execution>
+ <goals>
+ <goal>check</goal>
+ <goal>testCheck</goal>
+ </goals>
+ </execution>
+ </executions>
+ </plugin>
<plugin>
<groupId>org.codehaus.mojo</groupId>
<artifactId>license-maven-plugin</artifactId>
@@ -485,8 +512,8 @@ under the License.
<type>pom</type>
<scope>import</scope>
</dependency>
-
- <!-- Necessary until https://github.com/apache/storm/pull/3626 -->
+ <!-- Necessary until https://github.com/apache/storm/pull/3626 -->
+ <!-- update forbidden-apis when you update this -->
<dependency>
<groupId>commons-io</groupId>
<artifactId>commons-io</artifactId>