This is an automated email from the ASF dual-hosted git repository.

rzo1 pushed a commit to branch issues/1248
in repository https://gitbox.apache.org/repos/asf/incubator-stormcrawler.git

commit 809d719e957ae2e62cb5a6fa37cb7c28e2d0f20a
Author: Richard Zowalla <[email protected]>
AuthorDate: Fri Jul 5 13:28:13 2024 +0200

    #1248 - Use pre-compiled patterns for mime type matching in TikaParser
---
 .../java/org/apache/stormcrawler/tika/ParserBolt.java | 19 ++++++++++++++-----
 1 file changed, 14 insertions(+), 5 deletions(-)

diff --git a/external/tika/src/main/java/org/apache/stormcrawler/tika/ParserBolt.java b/external/tika/src/main/java/org/apache/stormcrawler/tika/ParserBolt.java
index 77314050..422615fd 100644
--- a/external/tika/src/main/java/org/apache/stormcrawler/tika/ParserBolt.java
+++ b/external/tika/src/main/java/org/apache/stormcrawler/tika/ParserBolt.java
@@ -23,6 +23,7 @@ import java.io.IOException;
 import java.net.MalformedURLException;
 import java.net.URL;
 import java.util.*;
+import java.util.regex.Pattern;
 import org.apache.commons.lang.StringUtils;
 import org.apache.html.dom.HTMLDocumentImpl;
 import org.apache.storm.metric.api.MultiCountMetric;
@@ -88,7 +89,7 @@ public class ParserBolt extends BaseRichBolt {
     private boolean emitOutlinks = true;
 
     /** regular expressions to apply to the mime-type * */
-    private List<String> mimeTypeWhiteList = new LinkedList<>();
+    private List<Pattern> mimeTypeWhiteList = new LinkedList<>();
 
     private String protocolMDprefix;
 
@@ -121,7 +122,15 @@ public class ParserBolt extends BaseRichBolt {
             throw e;
         }
 
-        mimeTypeWhiteList = ConfUtils.loadListFromConf("parser.mimetype.whitelist", conf);
+        final List<String> mimeTypeWhiteListStrings =
+                ConfUtils.loadListFromConf("parser.mimetype.whitelist", conf);
+        for (String mt : mimeTypeWhiteListStrings) {
+            try {
+                this.mimeTypeWhiteList.add(Pattern.compile(mt));
+            } catch (RuntimeException e) {
+                LOG.warn("Failed to compile whitelist regex: {}", mt);
+            }
+        }
 
         protocolMDprefix = ConfUtils.getString(conf, ProtocolResponse.PROTOCOL_MD_PREFIX_PARAM, "");
 
@@ -145,7 +154,7 @@ public class ParserBolt extends BaseRichBolt {
         Metadata metadata = (Metadata) tuple.getValueByField("metadata");
 
         // check that the mimetype is in the whitelist
-        if (mimeTypeWhiteList.size() > 0) {
+        if (!mimeTypeWhiteList.isEmpty()) {
             boolean mt_match = false;
             // see if a mimetype was guessed in JSOUPBolt
             String mimeType = metadata.getFirstValue("parse.Content-Type");
@@ -154,8 +163,8 @@ public class ParserBolt extends BaseRichBolt {
                 mimeType = metadata.getFirstValue(HttpHeaders.CONTENT_TYPE, this.protocolMDprefix);
             }
             if (mimeType != null) {
-                for (String mt : mimeTypeWhiteList) {
-                    if (mimeType.matches(mt)) {
+                for (Pattern mt : mimeTypeWhiteList) {
+                    if (mt.matcher(mimeType).matches()) {
                         mt_match = true;
                         break;
                     }

Reply via email to