This is an automated email from the ASF dual-hosted git repository.

tallison pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/tika.git

commit e287fe297a113bc172f1414b6c9cf5bdd91af307
Author: TALLISON <[email protected]>
AuthorDate: Fri Aug 16 12:53:48 2019 -0400

    TIKA-2906 -- cleanup and bug fix on entropy
---
 .../org/apache/tika/eval/AbstractProfiler.java     | 13 ++------
 .../apache/tika/eval/textstats/TokenEntropy.java   |  2 +-
 .../EvalExceptionUtils.java}                       | 35 ++++++++++++----------
 3 files changed, 23 insertions(+), 27 deletions(-)

diff --git a/tika-eval/src/main/java/org/apache/tika/eval/AbstractProfiler.java b/tika-eval/src/main/java/org/apache/tika/eval/AbstractProfiler.java
index becccf3..e23361b 100644
--- a/tika-eval/src/main/java/org/apache/tika/eval/AbstractProfiler.java
+++ b/tika-eval/src/main/java/org/apache/tika/eval/AbstractProfiler.java
@@ -66,6 +66,7 @@ import org.apache.tika.eval.tokens.TokenCounts;
 import org.apache.tika.eval.tokens.TokenIntPair;
 import org.apache.tika.eval.util.ContentTagParser;
 import org.apache.tika.eval.util.ContentTags;
+import org.apache.tika.eval.util.EvalExceptionUtils;
 import org.apache.tika.exception.TikaException;
 import org.apache.tika.metadata.Metadata;
 import org.apache.tika.metadata.PagedText;
@@ -183,10 +184,6 @@ public abstract class AbstractProfiler extends FileResourceConsumer {
     int maxTokens = 200000;
 
 
-    //these remove runtime info from the stacktraces so
-    //that actual causes can be counted.
-    private final static Pattern CAUSED_BY_SNIPPER =
-            Pattern.compile("(Caused by: [^:]+):[^\\r\\n]+");
 
     private final static Pattern ACCESS_PERMISSION_EXCEPTION =
             Pattern.compile("org\\.apache\\.tika\\.exception\\.AccessPermissionException");
@@ -495,7 +492,7 @@ public abstract class AbstractProfiler extends FileResourceConsumer {
 
     void getExceptionStrings(Metadata metadata, Map<Cols, String> data) {
 
-        String fullTrace = metadata.get(TikaCoreProperties.TIKA_META_EXCEPTION_PREFIX + "runtime");
+        String fullTrace = metadata.get(RecursiveParserWrapperHandler.CONTAINER_EXCEPTION);
 
         if (fullTrace == null) {
             fullTrace = metadata.get(AbstractRecursiveParserWrapperHandler.EMBEDDED_EXCEPTION);
@@ -528,11 +525,7 @@ public abstract class AbstractProfiler extends FileResourceConsumer {
             //IOException from org.apache.tika.parser.microsoft.OfficeParser@2b1ea6ee
             //For reporting purposes, let's snip off the object id so that we can more
             //easily count exceptions.
-            String sortTrace = ExceptionUtils.trimMessage(fullTrace);
-
-            matcher = CAUSED_BY_SNIPPER.matcher(sortTrace);
-            sortTrace = matcher.replaceAll("$1");
-            sortTrace = sortTrace.replaceAll("org.apache.tika.", "o.a.t.");
+            String sortTrace = EvalExceptionUtils.normalize(fullTrace);
             data.put(Cols.SORT_STACK_TRACE, sortTrace);
         }
     }
diff --git a/tika-eval/src/main/java/org/apache/tika/eval/textstats/TokenEntropy.java b/tika-eval/src/main/java/org/apache/tika/eval/textstats/TokenEntropy.java
index e9af315..dd538b0 100644
--- a/tika-eval/src/main/java/org/apache/tika/eval/textstats/TokenEntropy.java
+++ b/tika-eval/src/main/java/org/apache/tika/eval/textstats/TokenEntropy.java
@@ -34,6 +34,6 @@ public class TokenEntropy implements TokenCountStatsCalculator<Double> {
             p = (double) termFreq / totalTokens;
             ent += p * FastMath.log(base, p);
         }
-        return ent;
+        return -1.0*ent;
     }
 }
diff --git a/tika-eval/src/main/java/org/apache/tika/eval/textstats/TokenEntropy.java b/tika-eval/src/main/java/org/apache/tika/eval/util/EvalExceptionUtils.java
similarity index 50%
copy from tika-eval/src/main/java/org/apache/tika/eval/textstats/TokenEntropy.java
copy to tika-eval/src/main/java/org/apache/tika/eval/util/EvalExceptionUtils.java
index e9af315..ecad26f 100644
--- a/tika-eval/src/main/java/org/apache/tika/eval/textstats/TokenEntropy.java
+++ b/tika-eval/src/main/java/org/apache/tika/eval/util/EvalExceptionUtils.java
@@ -14,26 +14,29 @@
  * See the License for the specific language governing permissions and
  * limitations under the License.
  */
-package org.apache.tika.eval.textstats;
+package org.apache.tika.eval.util;
 
-import org.apache.commons.lang3.mutable.MutableInt;
-import org.apache.commons.math3.util.FastMath;
-import org.apache.tika.eval.tokens.TokenCounts;
+import java.util.regex.Matcher;
+import java.util.regex.Pattern;
 
-public class TokenEntropy implements TokenCountStatsCalculator<Double> {
+import org.apache.commons.lang3.StringUtils;
+import org.apache.tika.utils.ExceptionUtils;
 
-    @Override
-    public Double calculate(TokenCounts tokenCounts) {
-        double ent = 0.0d;
-        double p = 0.0d;
-        double base = 2.0;
-        double totalTokens = (double)tokenCounts.getTotalTokens();
-        for (MutableInt i : tokenCounts.getTokens().values()) {
-            int termFreq = i.intValue();
+public class EvalExceptionUtils {
 
-            p = (double) termFreq / totalTokens;
-            ent += p * FastMath.log(base, p);
+    //these remove runtime info from the stacktraces so
+    //that actual causes can be counted.
+    private final static Pattern CAUSED_BY_SNIPPER =
+            Pattern.compile("(Caused by: [^:]+):[^\\r\\n]+");
+
+    public static String normalize(String stacktrace) {
+        if (StringUtils.isBlank(stacktrace)) {
+            return "";
         }
-        return ent;
+        String sortTrace = ExceptionUtils.trimMessage(stacktrace);
+
+        Matcher matcher = CAUSED_BY_SNIPPER.matcher(sortTrace);
+        sortTrace = matcher.replaceAll("$1");
+        return sortTrace.replaceAll("org.apache.tika.", "o.a.t.");
     }
 }

Reply via email to