This is an automated email from the ASF dual-hosted git repository.

rzo1 pushed a commit to branch issues/1248
in repository https://gitbox.apache.org/repos/asf/incubator-stormcrawler.git
commit 809d719e957ae2e62cb5a6fa37cb7c28e2d0f20a Author: Richard Zowalla <[email protected]> AuthorDate: Fri Jul 5 13:28:13 2024 +0200 #1248 - Use pre-compiled patterns for mime type matching in TikaParser --- .../java/org/apache/stormcrawler/tika/ParserBolt.java | 19 ++++++++++++++----- 1 file changed, 14 insertions(+), 5 deletions(-) diff --git a/external/tika/src/main/java/org/apache/stormcrawler/tika/ParserBolt.java b/external/tika/src/main/java/org/apache/stormcrawler/tika/ParserBolt.java index 77314050..422615fd 100644 --- a/external/tika/src/main/java/org/apache/stormcrawler/tika/ParserBolt.java +++ b/external/tika/src/main/java/org/apache/stormcrawler/tika/ParserBolt.java @@ -23,6 +23,7 @@ import java.io.IOException; import java.net.MalformedURLException; import java.net.URL; import java.util.*; +import java.util.regex.Pattern; import org.apache.commons.lang.StringUtils; import org.apache.html.dom.HTMLDocumentImpl; import org.apache.storm.metric.api.MultiCountMetric; @@ -88,7 +89,7 @@ public class ParserBolt extends BaseRichBolt { private boolean emitOutlinks = true; /** regular expressions to apply to the mime-type * */ - private List<String> mimeTypeWhiteList = new LinkedList<>(); + private List<Pattern> mimeTypeWhiteList = new LinkedList<>(); private String protocolMDprefix; @@ -121,7 +122,15 @@ public class ParserBolt extends BaseRichBolt { throw e; } - mimeTypeWhiteList = ConfUtils.loadListFromConf("parser.mimetype.whitelist", conf); + final List<String> mimeTypeWhiteListStrings = + ConfUtils.loadListFromConf("parser.mimetype.whitelist", conf); + for (String mt : mimeTypeWhiteListStrings) { + try { + this.mimeTypeWhiteList.add(Pattern.compile(mt)); + } catch (RuntimeException e) { + LOG.warn("Failed to compile whitelist regex: {}", mt); + } + } protocolMDprefix = ConfUtils.getString(conf, ProtocolResponse.PROTOCOL_MD_PREFIX_PARAM, ""); @@ -145,7 +154,7 @@ public class ParserBolt extends BaseRichBolt { Metadata metadata = (Metadata) 
tuple.getValueByField("metadata"); // check that the mimetype is in the whitelist - if (mimeTypeWhiteList.size() > 0) { + if (!mimeTypeWhiteList.isEmpty()) { boolean mt_match = false; // see if a mimetype was guessed in JSOUPBolt String mimeType = metadata.getFirstValue("parse.Content-Type"); @@ -154,8 +163,8 @@ public class ParserBolt extends BaseRichBolt { mimeType = metadata.getFirstValue(HttpHeaders.CONTENT_TYPE, this.protocolMDprefix); } if (mimeType != null) { - for (String mt : mimeTypeWhiteList) { - if (mimeType.matches(mt)) { + for (Pattern mt : mimeTypeWhiteList) { + if (mt.matcher(mimeType).matches()) { mt_match = true; break; }
