This is an automated email from the ASF dual-hosted git repository. tallison pushed a commit to branch master in repository https://gitbox.apache.org/repos/asf/tika.git
commit e287fe297a113bc172f1414b6c9cf5bdd91af307 Author: TALLISON <[email protected]> AuthorDate: Fri Aug 16 12:53:48 2019 -0400 TIKA-2906 -- cleanup and bug fix on entropy --- .../org/apache/tika/eval/AbstractProfiler.java | 13 ++------ .../apache/tika/eval/textstats/TokenEntropy.java | 2 +- .../EvalExceptionUtils.java} | 35 ++++++++++++---------- 3 files changed, 23 insertions(+), 27 deletions(-) diff --git a/tika-eval/src/main/java/org/apache/tika/eval/AbstractProfiler.java b/tika-eval/src/main/java/org/apache/tika/eval/AbstractProfiler.java index becccf3..e23361b 100644 --- a/tika-eval/src/main/java/org/apache/tika/eval/AbstractProfiler.java +++ b/tika-eval/src/main/java/org/apache/tika/eval/AbstractProfiler.java @@ -66,6 +66,7 @@ import org.apache.tika.eval.tokens.TokenCounts; import org.apache.tika.eval.tokens.TokenIntPair; import org.apache.tika.eval.util.ContentTagParser; import org.apache.tika.eval.util.ContentTags; +import org.apache.tika.eval.util.EvalExceptionUtils; import org.apache.tika.exception.TikaException; import org.apache.tika.metadata.Metadata; import org.apache.tika.metadata.PagedText; @@ -183,10 +184,6 @@ public abstract class AbstractProfiler extends FileResourceConsumer { int maxTokens = 200000; - //these remove runtime info from the stacktraces so - //that actual causes can be counted. - private final static Pattern CAUSED_BY_SNIPPER = - Pattern.compile("(Caused by: [^:]+):[^\\r\\n]+"); private final static Pattern ACCESS_PERMISSION_EXCEPTION = Pattern.compile("org\\.apache\\.tika\\.exception\\.AccessPermissionException"); @@ -495,7 +492,7 @@ public abstract class AbstractProfiler extends FileResourceConsumer { void getExceptionStrings(Metadata metadata, Map<Cols, String> data) { - String fullTrace = metadata.get(TikaCoreProperties.TIKA_META_EXCEPTION_PREFIX + "runtime"); + String fullTrace = metadata.get(RecursiveParserWrapperHandler.CONTAINER_EXCEPTION); if (fullTrace == null) { fullTrace = metadata.get(AbstractRecursiveParserWrapperHandler.EMBEDDED_EXCEPTION); @@ -528,11 +525,7 @@ public abstract class AbstractProfiler extends FileResourceConsumer { //IOException from org.apache.tika.parser.microsoft.OfficeParser@2b1ea6ee //For reporting purposes, let's snip off the object id so that we can more //easily count exceptions. - String sortTrace = ExceptionUtils.trimMessage(fullTrace); - - matcher = CAUSED_BY_SNIPPER.matcher(sortTrace); - sortTrace = matcher.replaceAll("$1"); - sortTrace = sortTrace.replaceAll("org.apache.tika.", "o.a.t."); + String sortTrace = EvalExceptionUtils.normalize(fullTrace); data.put(Cols.SORT_STACK_TRACE, sortTrace); } } diff --git a/tika-eval/src/main/java/org/apache/tika/eval/textstats/TokenEntropy.java b/tika-eval/src/main/java/org/apache/tika/eval/textstats/TokenEntropy.java index e9af315..dd538b0 100644 --- a/tika-eval/src/main/java/org/apache/tika/eval/textstats/TokenEntropy.java +++ b/tika-eval/src/main/java/org/apache/tika/eval/textstats/TokenEntropy.java @@ -34,6 +34,6 @@ public class TokenEntropy implements TokenCountStatsCalculator<Double> { p = (double) termFreq / totalTokens; ent += p * FastMath.log(base, p); } - return ent; + return -1.0*ent; } } diff --git a/tika-eval/src/main/java/org/apache/tika/eval/textstats/TokenEntropy.java b/tika-eval/src/main/java/org/apache/tika/eval/util/EvalExceptionUtils.java similarity index 50% copy from tika-eval/src/main/java/org/apache/tika/eval/textstats/TokenEntropy.java copy to tika-eval/src/main/java/org/apache/tika/eval/util/EvalExceptionUtils.java index e9af315..ecad26f 100644 --- a/tika-eval/src/main/java/org/apache/tika/eval/textstats/TokenEntropy.java +++ b/tika-eval/src/main/java/org/apache/tika/eval/util/EvalExceptionUtils.java @@ -14,26 +14,29 @@ * See the License for the specific language governing permissions and * limitations under the License. */ -package org.apache.tika.eval.textstats; +package org.apache.tika.eval.util; -import org.apache.commons.lang3.mutable.MutableInt; -import org.apache.commons.math3.util.FastMath; -import org.apache.tika.eval.tokens.TokenCounts; +import java.util.regex.Matcher; +import java.util.regex.Pattern; -public class TokenEntropy implements TokenCountStatsCalculator<Double> { +import org.apache.commons.lang3.StringUtils; +import org.apache.tika.utils.ExceptionUtils; - @Override - public Double calculate(TokenCounts tokenCounts) { - double ent = 0.0d; - double p = 0.0d; - double base = 2.0; - double totalTokens = (double)tokenCounts.getTotalTokens(); - for (MutableInt i : tokenCounts.getTokens().values()) { - int termFreq = i.intValue(); +public class EvalExceptionUtils { - p = (double) termFreq / totalTokens; - ent += p * FastMath.log(base, p); + //these remove runtime info from the stacktraces so + //that actual causes can be counted. + private final static Pattern CAUSED_BY_SNIPPER = + Pattern.compile("(Caused by: [^:]+):[^\\r\\n]+"); + + public static String normalize(String stacktrace) { + if (StringUtils.isBlank(stacktrace)) { + return ""; } - return ent; + String sortTrace = ExceptionUtils.trimMessage(stacktrace); + + Matcher matcher = CAUSED_BY_SNIPPER.matcher(sortTrace); + sortTrace = matcher.replaceAll("$1"); + return sortTrace.replaceAll("org.apache.tika.", "o.a.t."); } }
