This is an automated email from the ASF dual-hosted git repository. tallison pushed a commit to branch TIKA-4532 in repository https://gitbox.apache.org/repos/asf/tika.git
commit 7a407b88a114a48d13c4da1e1e54a62d5742efcb Author: tallison <[email protected]> AuthorDate: Wed Oct 29 14:19:18 2025 -0400 TIKA-4532 -- remove lang3 --- tika-bundles/tika-bundle-standard/pom.xml | 1 - .../java/org/apache/tika/utils/StringUtils.java | 17 +++++++++ .../org/apache/tika/eval/app/ProfilerBase.java | 22 ++++++------ tika-eval/tika-eval-core/pom.xml | 4 --- .../eval/core/metadata/TikaEvalMetadataFilter.java | 5 ++- .../tika/eval/core/textstats/CommonTokens.java | 13 +++---- .../core/textstats/CommonTokensBhattacharyya.java | 7 ++-- .../eval/core/textstats/CommonTokensCosine.java | 9 +++-- .../eval/core/textstats/CommonTokensHellinger.java | 7 ++-- .../eval/core/textstats/CommonTokensKLDNormed.java | 9 +++-- .../core/textstats/CommonTokensKLDivergence.java | 9 +++-- .../{TokenEntropy.java => LangModelPair.java} | 22 ++---------- .../eval/core/textstats/TextProfileSignature.java | 2 +- .../tika/eval/core/textstats/TokenEntropy.java | 2 +- .../tika/eval/core/textstats/TokenLengths.java | 2 +- .../tika/eval/core/textstats/TopNTokens.java | 2 +- .../eval/core/textstats/UnicodeBlockCounter.java | 3 +- .../eval/core/tokens/CommonTokenCountManager.java | 7 ++-- .../tika/eval/core/tokens/TokenContraster.java | 3 +- .../apache/tika/eval/core/tokens/TokenCounter.java | 2 +- .../apache/tika/eval/core/tokens/TokenCounts.java | 2 +- .../tika/eval/core/util/EvalExceptionUtils.java | 3 +- .../TokenEntropy.java => util/MutableInt.java} | 41 +++++++++++++--------- .../tika/eval/core/tokens/TokenCounterTest.java | 3 +- .../apache/tika/pipes/grpc/TikaGrpcServerImpl.java | 4 +-- tika-parent/pom.xml | 6 ---- .../org/apache/tika/parser/dwg/DWGReadParser.java | 6 ++-- .../tika-parser-code-module/pom.xml | 5 --- .../executable/UniversalExecutableParser.java | 16 +++++---- .../tika-parser-microsoft-module/pom.xml | 4 --- .../parser/microsoft/onenote/OneNoteDocument.java | 10 +++--- .../parser/microsoft/onenote/OneNoteParser.java | 3 +- .../microsoft/onenote/OneNoteTreeWalker.java | 11 +++--- .../tika/parser/microsoft/onenote/RoleGuid.java | 4 +++ .../streamobj/chunking/ZipFilesChunking.java | 3 +- .../microsoft/onenote/OneNoteParserTest.java | 4 +-- .../tika-parser-miscoffice-module/pom.xml | 4 --- .../org/apache/tika/parser/mif/MIFExtractor.java | 7 ++-- .../tika-parser-ocr-module/pom.xml | 4 --- tika-server/tika-server-core/pom.xml | 4 --- tika-server/tika-server-standard/pom.xml | 1 - 41 files changed, 128 insertions(+), 165 deletions(-) diff --git a/tika-bundles/tika-bundle-standard/pom.xml b/tika-bundles/tika-bundle-standard/pom.xml index c6522f276..20f38620d 100644 --- a/tika-bundles/tika-bundle-standard/pom.xml +++ b/tika-bundles/tika-bundle-standard/pom.xml @@ -173,7 +173,6 @@ xmlbeans| jackcess| jackcess-encrypt| - commons-lang3| jsoup| asm| juniversalchardet| diff --git a/tika-core/src/main/java/org/apache/tika/utils/StringUtils.java b/tika-core/src/main/java/org/apache/tika/utils/StringUtils.java index b09963d46..f88f0eef0 100644 --- a/tika-core/src/main/java/org/apache/tika/utils/StringUtils.java +++ b/tika-core/src/main/java/org/apache/tika/utils/StringUtils.java @@ -44,6 +44,23 @@ public class StringUtils { return s == null || s.isBlank(); } + public static boolean isNotBlank(final String s) { + return ! isBlank(s); + } + + public static boolean startsWithIgnoreCase(String str, String prefix) { + if (str == null || prefix == null) { + return str != null && prefix.length() == 0; + } + + if (str.length() < prefix.length()) { + return false; + } + + return str.regionMatches(true, 0, prefix, 0, prefix.length()); + } + + /** * <p>Left pad a String with a specified String.</p> * diff --git a/tika-eval/tika-eval-app/src/main/java/org/apache/tika/eval/app/ProfilerBase.java b/tika-eval/tika-eval-app/src/main/java/org/apache/tika/eval/app/ProfilerBase.java index 18e30a5e4..904fadb23 100644 --- a/tika-eval/tika-eval-app/src/main/java/org/apache/tika/eval/app/ProfilerBase.java +++ b/tika-eval/tika-eval-app/src/main/java/org/apache/tika/eval/app/ProfilerBase.java @@ -34,8 +34,6 @@ import java.util.regex.Matcher; import java.util.regex.Pattern; import org.apache.commons.io.FilenameUtils; -import org.apache.commons.lang3.mutable.MutableInt; -import org.apache.commons.lang3.tuple.Pair; import org.apache.commons.math3.stat.descriptive.SummaryStatistics; import org.slf4j.Logger; import org.slf4j.LoggerFactory; @@ -64,6 +62,7 @@ import org.apache.tika.eval.core.tokens.TokenIntPair; import org.apache.tika.eval.core.util.ContentTagParser; import org.apache.tika.eval.core.util.ContentTags; import org.apache.tika.eval.core.util.EvalExceptionUtils; +import org.apache.tika.eval.core.util.MutableInt; import org.apache.tika.exception.TikaException; import org.apache.tika.language.detect.LanguageResult; import org.apache.tika.metadata.Metadata; @@ -616,15 +615,15 @@ public abstract class ProfilerBase { void unicodeBlocks(Map<Class, Object> tokenStats, Map<Cols, String> data) { Map<String, MutableInt> blocks = (Map<String, MutableInt>) tokenStats.get(UnicodeBlockCounter.class); - List<Pair<String, Integer>> pairs = new ArrayList<>(); + List<FeatureCount> pairs = new ArrayList<>(); for (Map.Entry<String, MutableInt> e : blocks.entrySet()) { - pairs.add(Pair.of(e.getKey(), e + pairs.add(new FeatureCount(e.getKey(), e .getValue() .intValue())); } - pairs.sort((o1, o2) -> o2 - .getValue() - .compareTo(o1.getValue())); + pairs.sort((o1, o2) -> + Integer.compare(o2.count, o1.count) + ); StringBuilder sb = new StringBuilder(); for (int i = 0; i < 20 && i < pairs.size(); i++) { @@ -633,12 +632,10 @@ public abstract class ProfilerBase { } sb .append(pairs - .get(i) - .getKey()) + .get(i).feature) .append(": ") .append(pairs - .get(i) - .getValue()); + .get(i).feature); } data.put(Cols.UNICODE_CHAR_BLOCKS, sb.toString()); } @@ -810,6 +807,9 @@ public abstract class ProfilerBase { OOM, TIMEOUT } + private record FeatureCount(String feature, int count) { + + }; } diff --git a/tika-eval/tika-eval-core/pom.xml b/tika-eval/tika-eval-core/pom.xml index 140442ef3..16cb4f778 100644 --- a/tika-eval/tika-eval-core/pom.xml +++ b/tika-eval/tika-eval-core/pom.xml @@ -64,10 +64,6 @@ <groupId>org.apache.lucene</groupId> <artifactId>lucene-analysis-icu</artifactId> </dependency> - <dependency> - <groupId>org.apache.commons</groupId> - <artifactId>commons-lang3</artifactId> - </dependency> <dependency> <groupId>org.jsoup</groupId> <artifactId>jsoup</artifactId> diff --git a/tika-eval/tika-eval-core/src/main/java/org/apache/tika/eval/core/metadata/TikaEvalMetadataFilter.java b/tika-eval/tika-eval-core/src/main/java/org/apache/tika/eval/core/metadata/TikaEvalMetadataFilter.java index 811958af4..99b26228e 100644 --- a/tika-eval/tika-eval-core/src/main/java/org/apache/tika/eval/core/metadata/TikaEvalMetadataFilter.java +++ b/tika-eval/tika-eval-core/src/main/java/org/apache/tika/eval/core/metadata/TikaEvalMetadataFilter.java @@ -20,8 +20,6 @@ import java.util.ArrayList; import java.util.List; import java.util.Map; -import org.apache.commons.lang3.StringUtils; - import org.apache.tika.eval.core.langid.LanguageIDWrapper; import org.apache.tika.eval.core.textstats.BasicTokenCountStatsCalculator; import org.apache.tika.eval.core.textstats.CommonTokens; @@ -35,6 +33,7 @@ import org.apache.tika.metadata.Metadata; import org.apache.tika.metadata.Property; import org.apache.tika.metadata.TikaCoreProperties; import org.apache.tika.metadata.filter.MetadataFilter; +import org.apache.tika.utils.StringUtils; public class TikaEvalMetadataFilter extends MetadataFilter { @@ -75,7 +74,7 @@ public class TikaEvalMetadataFilter extends MetadataFilter { @Override public void filter(Metadata metadata) throws TikaException { String content = metadata.get(TikaCoreProperties.TIKA_CONTENT); - if (StringUtils.isAllBlank(content)) { + if (StringUtils.isBlank(content)) { return; } calcStats(content, metadata); diff --git a/tika-eval/tika-eval-core/src/main/java/org/apache/tika/eval/core/textstats/CommonTokens.java b/tika-eval/tika-eval-core/src/main/java/org/apache/tika/eval/core/textstats/CommonTokens.java index dbdd4a67d..d9e39761b 100644 --- a/tika-eval/tika-eval-core/src/main/java/org/apache/tika/eval/core/textstats/CommonTokens.java +++ b/tika-eval/tika-eval-core/src/main/java/org/apache/tika/eval/core/textstats/CommonTokens.java @@ -20,14 +20,11 @@ import java.util.List; import java.util.Map; import java.util.Set; -import org.apache.commons.lang3.mutable.MutableInt; -import org.apache.commons.lang3.tuple.Pair; - import org.apache.tika.eval.core.tokens.AlphaIdeographFilterFactory; import org.apache.tika.eval.core.tokens.CommonTokenCountManager; import org.apache.tika.eval.core.tokens.CommonTokenResult; -import org.apache.tika.eval.core.tokens.LangModel; import org.apache.tika.eval.core.tokens.TokenCounts; +import org.apache.tika.eval.core.util.MutableInt; import org.apache.tika.language.detect.LanguageResult; public class CommonTokens implements LanguageAwareTokenCountStats<CommonTokenResult> { @@ -44,10 +41,10 @@ public class CommonTokens implements LanguageAwareTokenCountStats<CommonTokenRes @Override public CommonTokenResult calculate(List<LanguageResult> languages, TokenCounts tokenCounts) { - Pair<String, LangModel> pair = + LangModelPair pair = commonTokenCountManager.getLangTokens(languages.get(0).getLanguage()); - String actualLangCode = pair.getKey(); - Set<String> commonTokens = pair.getValue().getTokens(); + + Set<String> commonTokens = pair.langModel().getTokens(); int numUniqueCommonTokens = 0; int numCommonTokens = 0; int numUniqueAlphabeticTokens = 0; @@ -65,7 +62,7 @@ public class CommonTokens implements LanguageAwareTokenCountStats<CommonTokenRes } } - return new CommonTokenResult(actualLangCode, numUniqueCommonTokens, numCommonTokens, + return new CommonTokenResult(pair.lang(), numUniqueCommonTokens, numCommonTokens, numUniqueAlphabeticTokens, numAlphabeticTokens); } } diff --git a/tika-eval/tika-eval-core/src/main/java/org/apache/tika/eval/core/textstats/CommonTokensBhattacharyya.java b/tika-eval/tika-eval-core/src/main/java/org/apache/tika/eval/core/textstats/CommonTokensBhattacharyya.java index f23c17252..9fe075857 100644 --- a/tika-eval/tika-eval-core/src/main/java/org/apache/tika/eval/core/textstats/CommonTokensBhattacharyya.java +++ b/tika-eval/tika-eval-core/src/main/java/org/apache/tika/eval/core/textstats/CommonTokensBhattacharyya.java @@ -19,13 +19,12 @@ package org.apache.tika.eval.core.textstats; import java.util.List; import java.util.Map; -import org.apache.commons.lang3.mutable.MutableInt; -import org.apache.commons.lang3.tuple.Pair; import org.apache.commons.math3.util.FastMath; import org.apache.tika.eval.core.tokens.CommonTokenCountManager; import org.apache.tika.eval.core.tokens.LangModel; import org.apache.tika.eval.core.tokens.TokenCounts; +import org.apache.tika.eval.core.util.MutableInt; import org.apache.tika.language.detect.LanguageResult; public class CommonTokensBhattacharyya implements LanguageAwareTokenCountStats<Double> { @@ -38,9 +37,9 @@ public class CommonTokensBhattacharyya implements LanguageAwareTokenCountStats<D @Override public Double calculate(List<LanguageResult> languages, TokenCounts tokenCounts) { - Pair<String, LangModel> pair = + LangModelPair pair = commonTokenCountManager.getLangTokens(languages.get(0).getLanguage()); - LangModel model = pair.getValue(); + LangModel model = pair.langModel(); double sum = 0.0; if (tokenCounts.getTokens().entrySet().size() == 0) { return 0.0; diff --git a/tika-eval/tika-eval-core/src/main/java/org/apache/tika/eval/core/textstats/CommonTokensCosine.java b/tika-eval/tika-eval-core/src/main/java/org/apache/tika/eval/core/textstats/CommonTokensCosine.java index d0b275249..b0d7c3f9a 100644 --- a/tika-eval/tika-eval-core/src/main/java/org/apache/tika/eval/core/textstats/CommonTokensCosine.java +++ b/tika-eval/tika-eval-core/src/main/java/org/apache/tika/eval/core/textstats/CommonTokensCosine.java @@ -20,13 +20,12 @@ import java.util.Collection; import java.util.List; import java.util.Map; -import org.apache.commons.lang3.mutable.MutableInt; -import org.apache.commons.lang3.tuple.Pair; import org.apache.commons.math3.util.FastMath; import org.apache.tika.eval.core.tokens.CommonTokenCountManager; import org.apache.tika.eval.core.tokens.LangModel; import org.apache.tika.eval.core.tokens.TokenCounts; +import org.apache.tika.eval.core.util.MutableInt; import org.apache.tika.language.detect.LanguageResult; public class CommonTokensCosine implements LanguageAwareTokenCountStats<Double> { @@ -39,11 +38,11 @@ public class CommonTokensCosine implements LanguageAwareTokenCountStats<Double> @Override public Double calculate(List<LanguageResult> languages, TokenCounts tokenCounts) { - Pair<String, LangModel> pair = + LangModelPair pair = commonTokenCountManager.getLangTokens(languages.get(0).getLanguage()); - LangModel model = pair.getValue(); + LangModel model = pair.langModel(); double kl = 0.0; - if (tokenCounts.getTokens().entrySet().size() == 0) { + if (tokenCounts.getTokens().isEmpty()) { return 1.0; } double numerator = 0.0; diff --git a/tika-eval/tika-eval-core/src/main/java/org/apache/tika/eval/core/textstats/CommonTokensHellinger.java b/tika-eval/tika-eval-core/src/main/java/org/apache/tika/eval/core/textstats/CommonTokensHellinger.java index cbbcacc46..fb221aacc 100644 --- a/tika-eval/tika-eval-core/src/main/java/org/apache/tika/eval/core/textstats/CommonTokensHellinger.java +++ b/tika-eval/tika-eval-core/src/main/java/org/apache/tika/eval/core/textstats/CommonTokensHellinger.java @@ -19,13 +19,12 @@ package org.apache.tika.eval.core.textstats; import java.util.List; import java.util.Map; -import org.apache.commons.lang3.mutable.MutableInt; -import org.apache.commons.lang3.tuple.Pair; import org.apache.commons.math3.util.FastMath; import org.apache.tika.eval.core.tokens.CommonTokenCountManager; import org.apache.tika.eval.core.tokens.LangModel; import org.apache.tika.eval.core.tokens.TokenCounts; +import org.apache.tika.eval.core.util.MutableInt; import org.apache.tika.language.detect.LanguageResult; public class CommonTokensHellinger implements LanguageAwareTokenCountStats<Double> { @@ -38,9 +37,9 @@ public class CommonTokensHellinger implements LanguageAwareTokenCountStats<Doubl @Override public Double calculate(List<LanguageResult> languages, TokenCounts tokenCounts) { - Pair<String, LangModel> pair = + LangModelPair pair = commonTokenCountManager.getLangTokens(languages.get(0).getLanguage()); - LangModel model = pair.getValue(); + LangModel model = pair.langModel(); double sum = 0.0; if (tokenCounts.getTokens().entrySet().size() == 0) { return 0.0; diff --git a/tika-eval/tika-eval-core/src/main/java/org/apache/tika/eval/core/textstats/CommonTokensKLDNormed.java b/tika-eval/tika-eval-core/src/main/java/org/apache/tika/eval/core/textstats/CommonTokensKLDNormed.java index ec0a99ba7..1922f6603 100644 --- a/tika-eval/tika-eval-core/src/main/java/org/apache/tika/eval/core/textstats/CommonTokensKLDNormed.java +++ b/tika-eval/tika-eval-core/src/main/java/org/apache/tika/eval/core/textstats/CommonTokensKLDNormed.java @@ -19,13 +19,12 @@ package org.apache.tika.eval.core.textstats; import java.util.List; import java.util.Map; -import org.apache.commons.lang3.mutable.MutableInt; -import org.apache.commons.lang3.tuple.Pair; import org.apache.commons.math3.util.FastMath; import org.apache.tika.eval.core.tokens.CommonTokenCountManager; import org.apache.tika.eval.core.tokens.LangModel; import org.apache.tika.eval.core.tokens.TokenCounts; +import org.apache.tika.eval.core.util.MutableInt; import org.apache.tika.language.detect.LanguageResult; public class CommonTokensKLDNormed implements LanguageAwareTokenCountStats<Double> { @@ -38,11 +37,11 @@ public class CommonTokensKLDNormed implements LanguageAwareTokenCountStats<Doubl @Override public Double calculate(List<LanguageResult> languages, TokenCounts tokenCounts) { - Pair<String, LangModel> pair = + LangModelPair pair = commonTokenCountManager.getLangTokens(languages.get(0).getLanguage()); - LangModel model = pair.getValue(); + LangModel model = pair.langModel(); double kl = 0.0; - if (tokenCounts.getTokens().entrySet().size() == 0) { + if (tokenCounts.getTokens().isEmpty()) { return 1.0; } double worstCase = 0.0; diff --git a/tika-eval/tika-eval-core/src/main/java/org/apache/tika/eval/core/textstats/CommonTokensKLDivergence.java b/tika-eval/tika-eval-core/src/main/java/org/apache/tika/eval/core/textstats/CommonTokensKLDivergence.java index 59d31aeb9..8bfb3a034 100644 --- a/tika-eval/tika-eval-core/src/main/java/org/apache/tika/eval/core/textstats/CommonTokensKLDivergence.java +++ b/tika-eval/tika-eval-core/src/main/java/org/apache/tika/eval/core/textstats/CommonTokensKLDivergence.java @@ -19,13 +19,12 @@ package org.apache.tika.eval.core.textstats; import java.util.List; import java.util.Map; -import org.apache.commons.lang3.mutable.MutableInt; -import org.apache.commons.lang3.tuple.Pair; import org.apache.commons.math3.util.FastMath; import org.apache.tika.eval.core.tokens.CommonTokenCountManager; import org.apache.tika.eval.core.tokens.LangModel; import org.apache.tika.eval.core.tokens.TokenCounts; +import org.apache.tika.eval.core.util.MutableInt; import org.apache.tika.language.detect.LanguageResult; public class CommonTokensKLDivergence implements LanguageAwareTokenCountStats<Double> { @@ -38,11 +37,11 @@ public class CommonTokensKLDivergence implements LanguageAwareTokenCountStats<Do @Override public Double calculate(List<LanguageResult> languages, TokenCounts tokenCounts) { - Pair<String, LangModel> pair = + LangModelPair pair = commonTokenCountManager.getLangTokens(languages.get(0).getLanguage()); - LangModel model = pair.getValue(); + LangModel model = pair.langModel(); double kl = 0.0; - if (tokenCounts.getTokens().entrySet().size() == 0) { + if (tokenCounts.getTokens().isEmpty()) { return 1.0; } for (Map.Entry<String, MutableInt> e : tokenCounts.getTokens().entrySet()) { diff --git a/tika-eval/tika-eval-core/src/main/java/org/apache/tika/eval/core/textstats/TokenEntropy.java b/tika-eval/tika-eval-core/src/main/java/org/apache/tika/eval/core/textstats/LangModelPair.java similarity index 55% copy from tika-eval/tika-eval-core/src/main/java/org/apache/tika/eval/core/textstats/TokenEntropy.java copy to tika-eval/tika-eval-core/src/main/java/org/apache/tika/eval/core/textstats/LangModelPair.java index ebb2d0a5f..1f7591634 100644 --- a/tika-eval/tika-eval-core/src/main/java/org/apache/tika/eval/core/textstats/TokenEntropy.java +++ b/tika-eval/tika-eval-core/src/main/java/org/apache/tika/eval/core/textstats/LangModelPair.java @@ -16,25 +16,7 @@ */ package org.apache.tika.eval.core.textstats; -import org.apache.commons.lang3.mutable.MutableInt; -import org.apache.commons.math3.util.FastMath; +import org.apache.tika.eval.core.tokens.LangModel; -import org.apache.tika.eval.core.tokens.TokenCounts; - -public class TokenEntropy implements TokenCountStatsCalculator<Double> { - - @Override - public Double calculate(TokenCounts tokenCounts) { - double ent = 0.0d; - double p = 0.0d; - double base = 2.0; - double totalTokens = (double) tokenCounts.getTotalTokens(); - for (MutableInt i : tokenCounts.getTokens().values()) { - int termFreq = i.intValue(); - - p = (double) termFreq / totalTokens; - ent += p * FastMath.log(base, p); - } - return -1.0 * ent; - } +public record LangModelPair(String lang, LangModel langModel) { } diff --git a/tika-eval/tika-eval-core/src/main/java/org/apache/tika/eval/core/textstats/TextProfileSignature.java b/tika-eval/tika-eval-core/src/main/java/org/apache/tika/eval/core/textstats/TextProfileSignature.java index 9f726c93a..60ceff5fb 100644 --- a/tika-eval/tika-eval-core/src/main/java/org/apache/tika/eval/core/textstats/TextProfileSignature.java +++ b/tika-eval/tika-eval-core/src/main/java/org/apache/tika/eval/core/textstats/TextProfileSignature.java @@ -23,9 +23,9 @@ import java.util.Map; import org.apache.commons.codec.binary.Base32; import org.apache.commons.codec.digest.DigestUtils; -import org.apache.commons.lang3.mutable.MutableInt; import org.apache.tika.eval.core.tokens.TokenCounts; +import org.apache.tika.eval.core.util.MutableInt; /** * Copied nearly directly from Apache Nutch: diff --git a/tika-eval/tika-eval-core/src/main/java/org/apache/tika/eval/core/textstats/TokenEntropy.java b/tika-eval/tika-eval-core/src/main/java/org/apache/tika/eval/core/textstats/TokenEntropy.java index ebb2d0a5f..fd7b32225 100644 --- a/tika-eval/tika-eval-core/src/main/java/org/apache/tika/eval/core/textstats/TokenEntropy.java +++ b/tika-eval/tika-eval-core/src/main/java/org/apache/tika/eval/core/textstats/TokenEntropy.java @@ -16,10 +16,10 @@ */ package org.apache.tika.eval.core.textstats; -import org.apache.commons.lang3.mutable.MutableInt; import org.apache.commons.math3.util.FastMath; import org.apache.tika.eval.core.tokens.TokenCounts; +import org.apache.tika.eval.core.util.MutableInt; public class TokenEntropy implements TokenCountStatsCalculator<Double> { diff --git a/tika-eval/tika-eval-core/src/main/java/org/apache/tika/eval/core/textstats/TokenLengths.java b/tika-eval/tika-eval-core/src/main/java/org/apache/tika/eval/core/textstats/TokenLengths.java index c98eb06d7..7884e5530 100644 --- a/tika-eval/tika-eval-core/src/main/java/org/apache/tika/eval/core/textstats/TokenLengths.java +++ b/tika-eval/tika-eval-core/src/main/java/org/apache/tika/eval/core/textstats/TokenLengths.java @@ -18,10 +18,10 @@ package org.apache.tika.eval.core.textstats; import java.util.Map; -import org.apache.commons.lang3.mutable.MutableInt; import org.apache.commons.math3.stat.descriptive.SummaryStatistics; import org.apache.tika.eval.core.tokens.TokenCounts; +import org.apache.tika.eval.core.util.MutableInt; public class TokenLengths implements TokenCountStatsCalculator<SummaryStatistics> { diff --git a/tika-eval/tika-eval-core/src/main/java/org/apache/tika/eval/core/textstats/TopNTokens.java b/tika-eval/tika-eval-core/src/main/java/org/apache/tika/eval/core/textstats/TopNTokens.java index faf74dea2..e27d41d69 100644 --- a/tika-eval/tika-eval-core/src/main/java/org/apache/tika/eval/core/textstats/TopNTokens.java +++ b/tika-eval/tika-eval-core/src/main/java/org/apache/tika/eval/core/textstats/TopNTokens.java @@ -18,11 +18,11 @@ package org.apache.tika.eval.core.textstats; import java.util.Map; -import org.apache.commons.lang3.mutable.MutableInt; import org.apache.commons.math3.stat.descriptive.SummaryStatistics; import org.apache.tika.eval.core.tokens.TokenCounts; import org.apache.tika.eval.core.tokens.TokenIntPair; +import org.apache.tika.eval.core.util.MutableInt; public class TopNTokens implements TokenCountStatsCalculator<TokenIntPair[]> { diff --git a/tika-eval/tika-eval-core/src/main/java/org/apache/tika/eval/core/textstats/UnicodeBlockCounter.java b/tika-eval/tika-eval-core/src/main/java/org/apache/tika/eval/core/textstats/UnicodeBlockCounter.java index c02852e88..7cd4b99d8 100644 --- a/tika-eval/tika-eval-core/src/main/java/org/apache/tika/eval/core/textstats/UnicodeBlockCounter.java +++ b/tika-eval/tika-eval-core/src/main/java/org/apache/tika/eval/core/textstats/UnicodeBlockCounter.java @@ -23,10 +23,11 @@ import java.util.Collections; import java.util.HashMap; import java.util.Map; -import org.apache.commons.lang3.mutable.MutableInt; import org.slf4j.Logger; import org.slf4j.LoggerFactory; +import org.apache.tika.eval.core.util.MutableInt; + public class UnicodeBlockCounter implements StringStatsCalculator<Map<String, MutableInt>> { private static final Logger LOG = LoggerFactory.getLogger(UnicodeBlockCounter.class); diff --git a/tika-eval/tika-eval-core/src/main/java/org/apache/tika/eval/core/tokens/CommonTokenCountManager.java b/tika-eval/tika-eval-core/src/main/java/org/apache/tika/eval/core/tokens/CommonTokenCountManager.java index 696890d04..624af9c3d 100644 --- a/tika-eval/tika-eval-core/src/main/java/org/apache/tika/eval/core/tokens/CommonTokenCountManager.java +++ b/tika-eval/tika-eval-core/src/main/java/org/apache/tika/eval/core/tokens/CommonTokenCountManager.java @@ -34,10 +34,11 @@ import java.util.regex.Matcher; import java.util.regex.Pattern; import org.apache.commons.io.IOUtils; -import org.apache.commons.lang3.tuple.Pair; import org.slf4j.Logger; import org.slf4j.LoggerFactory; +import org.apache.tika.eval.core.textstats.LangModelPair; + public class CommonTokenCountManager { private static final Logger LOG = LoggerFactory.getLogger(CommonTokenCountManager.class); @@ -90,9 +91,9 @@ public class CommonTokenCountManager { * @return pair of actual language code used and a set of common * tokens for that language */ - public Pair<String, LangModel> getLangTokens(String lang) { + public LangModelPair getLangTokens(String lang) { String actualLangCode = getActualLangCode(lang); - return Pair.of(actualLangCode, commonTokenMap.get(actualLangCode)); + return new LangModelPair(actualLangCode, commonTokenMap.get(actualLangCode)); } //return langcode for lang that you are actually using diff --git a/tika-eval/tika-eval-core/src/main/java/org/apache/tika/eval/core/tokens/TokenContraster.java b/tika-eval/tika-eval-core/src/main/java/org/apache/tika/eval/core/tokens/TokenContraster.java index b61862511..cca36d712 100644 --- a/tika-eval/tika-eval-core/src/main/java/org/apache/tika/eval/core/tokens/TokenContraster.java +++ b/tika-eval/tika-eval-core/src/main/java/org/apache/tika/eval/core/tokens/TokenContraster.java @@ -19,9 +19,10 @@ package org.apache.tika.eval.core.tokens; import java.util.Map; -import org.apache.commons.lang3.mutable.MutableInt; import org.apache.lucene.util.PriorityQueue; +import org.apache.tika.eval.core.util.MutableInt; + /** * Computes some corpus contrast statistics. * <p> diff --git a/tika-eval/tika-eval-core/src/main/java/org/apache/tika/eval/core/tokens/TokenCounter.java b/tika-eval/tika-eval-core/src/main/java/org/apache/tika/eval/core/tokens/TokenCounter.java index bd9cf9c3a..08b752989 100644 --- a/tika-eval/tika-eval-core/src/main/java/org/apache/tika/eval/core/tokens/TokenCounter.java +++ b/tika-eval/tika-eval-core/src/main/java/org/apache/tika/eval/core/tokens/TokenCounter.java @@ -21,7 +21,6 @@ import java.util.Collections; import java.util.HashMap; import java.util.Map; -import org.apache.commons.lang3.mutable.MutableInt; import org.apache.commons.math3.stat.descriptive.SummaryStatistics; import org.apache.commons.math3.util.FastMath; import org.apache.lucene.analysis.Analyzer; @@ -33,6 +32,7 @@ import org.apache.tika.eval.core.textstats.TokenCountPriorityQueue; import org.apache.tika.eval.core.textstats.TokenEntropy; import org.apache.tika.eval.core.textstats.TokenLengths; import org.apache.tika.eval.core.textstats.TopNTokens; +import org.apache.tika.eval.core.util.MutableInt; /** * @deprecated use {@link CompositeTextStatsCalculator} diff --git a/tika-eval/tika-eval-core/src/main/java/org/apache/tika/eval/core/tokens/TokenCounts.java b/tika-eval/tika-eval-core/src/main/java/org/apache/tika/eval/core/tokens/TokenCounts.java index 8b420696c..53b7f4e1a 100644 --- a/tika-eval/tika-eval-core/src/main/java/org/apache/tika/eval/core/tokens/TokenCounts.java +++ b/tika-eval/tika-eval-core/src/main/java/org/apache/tika/eval/core/tokens/TokenCounts.java @@ -19,7 +19,7 @@ package org.apache.tika.eval.core.tokens; import java.util.HashMap; import java.util.Map; -import org.apache.commons.lang3.mutable.MutableInt; +import org.apache.tika.eval.core.util.MutableInt; public class TokenCounts { diff --git a/tika-eval/tika-eval-core/src/main/java/org/apache/tika/eval/core/util/EvalExceptionUtils.java b/tika-eval/tika-eval-core/src/main/java/org/apache/tika/eval/core/util/EvalExceptionUtils.java index 3b9454718..f89179d39 100644 --- a/tika-eval/tika-eval-core/src/main/java/org/apache/tika/eval/core/util/EvalExceptionUtils.java +++ b/tika-eval/tika-eval-core/src/main/java/org/apache/tika/eval/core/util/EvalExceptionUtils.java @@ -19,9 +19,8 @@ package org.apache.tika.eval.core.util; import java.util.regex.Matcher; import java.util.regex.Pattern; -import org.apache.commons.lang3.StringUtils; - import org.apache.tika.utils.ExceptionUtils; +import org.apache.tika.utils.StringUtils; public class EvalExceptionUtils { diff --git a/tika-eval/tika-eval-core/src/main/java/org/apache/tika/eval/core/textstats/TokenEntropy.java b/tika-eval/tika-eval-core/src/main/java/org/apache/tika/eval/core/util/MutableInt.java similarity index 54% copy from tika-eval/tika-eval-core/src/main/java/org/apache/tika/eval/core/textstats/TokenEntropy.java copy to tika-eval/tika-eval-core/src/main/java/org/apache/tika/eval/core/util/MutableInt.java index ebb2d0a5f..ed18ba9f3 100644 --- a/tika-eval/tika-eval-core/src/main/java/org/apache/tika/eval/core/textstats/TokenEntropy.java +++ b/tika-eval/tika-eval-core/src/main/java/org/apache/tika/eval/core/util/MutableInt.java @@ -14,27 +14,36 @@ * See the License for the specific language governing permissions and * limitations under the License. */ -package org.apache.tika.eval.core.textstats; +package org.apache.tika.eval.core.util; -import org.apache.commons.lang3.mutable.MutableInt; -import org.apache.commons.math3.util.FastMath; +/** + * non-thread safe mutable int + */ +public class MutableInt { -import org.apache.tika.eval.core.tokens.TokenCounts; + private int val = 0; + public MutableInt(int i) { + this.val = i; + } -public class TokenEntropy implements TokenCountStatsCalculator<Double> { + public void increment() { + val++; + } + public int intValue() { + return val; + } @Override - public Double calculate(TokenCounts tokenCounts) { - double ent = 0.0d; - double p = 0.0d; - double base = 2.0; - double totalTokens = (double) tokenCounts.getTotalTokens(); - for (MutableInt i : tokenCounts.getTokens().values()) { - int termFreq = i.intValue(); - - p = (double) termFreq / totalTokens; - ent += p * FastMath.log(base, p); + public final boolean equals(Object o) { + if (!(o instanceof MutableInt that)) { + return false; } - return -1.0 * ent; + + return val == that.val; + } + + @Override + public int hashCode() { + return val; } } diff --git a/tika-eval/tika-eval-core/src/test/java/org/apache/tika/eval/core/tokens/TokenCounterTest.java b/tika-eval/tika-eval-core/src/test/java/org/apache/tika/eval/core/tokens/TokenCounterTest.java index dc687a2b2..87c1608f7 100644 --- a/tika-eval/tika-eval-core/src/test/java/org/apache/tika/eval/core/tokens/TokenCounterTest.java +++ b/tika-eval/tika-eval-core/src/test/java/org/apache/tika/eval/core/tokens/TokenCounterTest.java @@ -25,13 +25,14 @@ import java.util.HashMap; import java.util.Map; import java.util.Random; -import org.apache.commons.lang3.mutable.MutableInt; import org.apache.lucene.analysis.Analyzer; import org.apache.lucene.analysis.TokenStream; import org.apache.lucene.analysis.tokenattributes.CharTermAttribute; import org.junit.jupiter.api.BeforeAll; import org.junit.jupiter.api.Test; +import org.apache.tika.eval.core.util.MutableInt; + public class TokenCounterTest { private final static String FIELD = "f"; private static AnalyzerManager analyzerManager; diff --git a/tika-grpc/src/main/java/org/apache/tika/pipes/grpc/TikaGrpcServerImpl.java b/tika-grpc/src/main/java/org/apache/tika/pipes/grpc/TikaGrpcServerImpl.java index 7f06cb681..c3b312676 100644 --- a/tika-grpc/src/main/java/org/apache/tika/pipes/grpc/TikaGrpcServerImpl.java +++ b/tika-grpc/src/main/java/org/apache/tika/pipes/grpc/TikaGrpcServerImpl.java @@ -44,7 +44,6 @@ import com.google.rpc.Status; import io.grpc.protobuf.StatusProto; import io.grpc.stub.StreamObserver; import org.apache.commons.io.FileUtils; -import org.apache.commons.lang3.StringUtils; import org.slf4j.Logger; import org.slf4j.LoggerFactory; import org.w3c.dom.Document; @@ -79,6 +78,7 @@ import org.apache.tika.pipes.core.fetcher.AbstractFetcher; import org.apache.tika.pipes.core.fetcher.FetchKey; import org.apache.tika.pipes.core.fetcher.config.AbstractConfig; import org.apache.tika.pipes.core.fetcher.config.FetcherConfigContainer; +import org.apache.tika.utils.StringUtils; import org.apache.tika.utils.XMLReaderUtils; class TikaGrpcServerImpl extends TikaGrpc.TikaImplBase { @@ -225,7 +225,7 @@ class TikaGrpcServerImpl extends TikaGrpc.TikaImplBase { try { ParseContext parseContext = new ParseContext(); String additionalFetchConfigJson = request.getAdditionalFetchConfigJson(); - if (StringUtils.isNotBlank(additionalFetchConfigJson)) { + if (! StringUtils.isBlank(additionalFetchConfigJson)) { // The fetch and parse has the option to specify additional configuration AbstractConfig abstractConfig = expiringFetcherStore .getFetcherConfigs() diff --git a/tika-parent/pom.xml b/tika-parent/pom.xml index 3f00072f7..235adf019 100644 --- a/tika-parent/pom.xml +++ b/tika-parent/pom.xml @@ -339,7 +339,6 @@ <commons.exec.version>1.5.0</commons.exec.version> <commons.fileupload.version>1.6.0</commons.fileupload.version> <commons.io.version>2.20.0</commons.io.version> - <commons.lang3.version>3.19.0</commons.lang3.version> <commons.logging.version>1.3.5</commons.logging.version> <commons.math3.version>3.6.1</commons.math3.version> <commons.net.version>3.12.0</commons.net.version> @@ -883,11 +882,6 @@ <artifactId>commons-exec</artifactId> <version>${commons.exec.version}</version> </dependency> - <dependency> - <groupId>org.apache.commons</groupId> - <artifactId>commons-lang3</artifactId> - <version>${commons.lang3.version}</version> - </dependency> <dependency> <groupId>org.apache.commons</groupId> <artifactId>commons-math3</artifactId> diff --git a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-cad-module/src/main/java/org/apache/tika/parser/dwg/DWGReadParser.java b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-cad-module/src/main/java/org/apache/tika/parser/dwg/DWGReadParser.java index 241aa738e..34b6bace2 100644 --- a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-cad-module/src/main/java/org/apache/tika/parser/dwg/DWGReadParser.java +++ b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-cad-module/src/main/java/org/apache/tika/parser/dwg/DWGReadParser.java @@ -41,8 +41,6 @@ import com.fasterxml.jackson.core.JsonParser; import com.fasterxml.jackson.core.JsonToken; import com.fasterxml.jackson.core.json.JsonReadFeature; import org.apache.commons.io.FileUtils; -import org.apache.commons.lang3.StringUtils; -import org.apache.commons.lang3.Strings; import org.slf4j.Logger; import org.slf4j.LoggerFactory; import org.xml.sax.ContentHandler; @@ -57,6 +55,7 @@ import org.apache.tika.sax.XHTMLContentHandler; import org.apache.tika.utils.ExceptionUtils; import org.apache.tika.utils.FileProcessResult; import org.apache.tika.utils.ProcessUtils; +import org.apache.tika.utils.StringUtils; @@ -245,7 +244,6 @@ public class DWGReadParser extends AbstractDWGParser { if ("text".equals(nextFieldName)) { String textVal = jsonParser.getText(); if (StringUtils.isNotBlank(textVal)) { - textConsumer.accept(textVal); } } else if ("text_value".equals(nextFieldName)) { @@ -312,7 +310,7 @@ public class DWGReadParser extends AbstractDWGParser { metadata.set(TikaCoreProperties.TITLE, textVal); } else if ("LASTSAVEDBY".equals(nextFieldName)) { metadata.set(TikaCoreProperties.MODIFIER, textVal); - } else if (!Strings.CI.startsWith(nextFieldName, "unknown")) { + } else if (! StringUtils.startsWithIgnoreCase(nextFieldName, "unknown")) { metadata.set(nextFieldName, textVal); } } diff --git a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-code-module/pom.xml b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-code-module/pom.xml index 8bc0f9667..107504dd7 100644 --- a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-code-module/pom.xml +++ b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-code-module/pom.xml @@ -49,11 +49,6 @@ <artifactId>asm</artifactId> <version>${asm.version}</version> </dependency> - <dependency> - <groupId>org.apache.commons</groupId> - <artifactId>commons-lang3</artifactId> - </dependency> - <dependency> <groupId>com.epam</groupId> <artifactId>parso</artifactId> diff --git a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-code-module/src/main/java/org/apache/tika/parser/executable/UniversalExecutableParser.java b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-code-module/src/main/java/org/apache/tika/parser/executable/UniversalExecutableParser.java index 20e12a564..595d77b6d 100644 --- a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-code-module/src/main/java/org/apache/tika/parser/executable/UniversalExecutableParser.java +++ b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-code-module/src/main/java/org/apache/tika/parser/executable/UniversalExecutableParser.java @@ -24,7 +24,6 @@ import java.util.Comparator; import java.util.Set; import org.apache.commons.io.IOUtils; -import org.apache.commons.lang3.tuple.Pair; import org.xml.sax.ContentHandler; import org.xml.sax.SAXException; @@ -111,7 +110,7 @@ public class UniversalExecutableParser implements Parser { long archsSize = (long) archsCount * archStructSize; var unsortedOffsets = false; - var offsetAndSizePerArch = new Pair[archsCount]; + var offsetAndSizePerArch = new OffsetSize[archsCount]; for (int archIndex = 0; archIndex < archsCount; archIndex++) { IOUtils.skipFully(stream, 8); @@ -121,7 +120,7 @@ public class UniversalExecutableParser implements Parser { if (offset < 4 + 4 + archsSize) { throw new TikaException("Invalid offset: " + offset); } - if (!unsortedOffsets && archIndex > 0 && offset < (long) offsetAndSizePerArch[archIndex - 1].getLeft()) { + if (!unsortedOffsets && archIndex > 0 && offset < (long) offsetAndSizePerArch[archIndex - 1].offset) { unsortedOffsets = true; } long size = is64 @@ -131,7 +130,7 @@ public class UniversalExecutableParser implements Parser { if (size < 0 || size > MAX_ARCH_SIZE) { throw new TikaException("Arch size=" + size + " must be > 0 and < " + MAX_ARCH_SIZE); } - offsetAndSizePerArch[archIndex] = Pair.of(offset, size); + offsetAndSizePerArch[archIndex] = new OffsetSize(offset, size); if (is64) { IOUtils.skipFully(stream, 8); @@ -142,14 +141,14 @@ public class UniversalExecutableParser implements Parser { currentOffset += archStructSize; } if (unsortedOffsets) { - Arrays.sort(offsetAndSizePerArch, Comparator.comparingLong(entry -> (long) entry.getLeft())); + Arrays.sort(offsetAndSizePerArch, Comparator.comparingLong(entry -> (long) entry.offset)); } for (int archIndex = 0; archIndex < archsCount; archIndex++) { - long skipUntilStart = (long)offsetAndSizePerArch[archIndex].getLeft() - currentOffset; + long skipUntilStart = (long)offsetAndSizePerArch[archIndex].offset - currentOffset; IOUtils.skipFully(stream, skipUntilStart); currentOffset += skipUntilStart; - long sz = (long)offsetAndSizePerArch[archIndex].getRight(); + long sz = (long)offsetAndSizePerArch[archIndex].size; //we bounds checked this above. byte[] perArchMachO = new byte[(int)sz]; IOUtils.readFully(stream, perArchMachO); @@ -163,4 +162,7 @@ public class UniversalExecutableParser implements Parser { } } + private record OffsetSize(long offset, long size) { + + } } diff --git a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/pom.xml b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/pom.xml index 97a19a9a7..db04aed4b 100644 --- a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/pom.xml +++ b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/pom.xml @@ -71,10 +71,6 @@ <groupId>commons-codec</groupId> <artifactId>commons-codec</artifactId> </dependency> - <dependency> - <groupId>org.apache.commons</groupId> - <artifactId>commons-lang3</artifactId> - </dependency> <dependency> <groupId>org.apache.poi</groupId> <artifactId>poi</artifactId> diff --git a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/main/java/org/apache/tika/parser/microsoft/onenote/OneNoteDocument.java b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/main/java/org/apache/tika/parser/microsoft/onenote/OneNoteDocument.java index a949b0762..26e2f32ef 100644 --- a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/main/java/org/apache/tika/parser/microsoft/onenote/OneNoteDocument.java +++ b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/main/java/org/apache/tika/parser/microsoft/onenote/OneNoteDocument.java @@ -22,8 +22,6 @@ import java.util.HashMap; import java.util.List; import java.util.Map; -import org.apache.commons.lang3.tuple.Pair; - class OneNoteDocument { OneNoteHeader header; List<ExtendedGUID> revisionListOrder = new ArrayList<>(); @@ -32,7 +30,7 @@ class OneNoteDocument { Map<ExtendedGUID, FileChunkReference> guidToRef = new HashMap<>(); Map<ExtendedGUID, FileNodePtr> guidToObject = new HashMap<>(); - Map<ExtendedGUID, Pair<Long, ExtendedGUID>> revisionRoleMap = new HashMap<>(); + Map<ExtendedGUID, RoleGuid> revisionRoleMap = new HashMap<>(); ExtendedGUID currentRevision = ExtendedGUID.nil(); FileNodeList root = new FileNodeList(); @@ -63,7 +61,7 @@ class OneNoteDocument { public void registerAdditionalRevisionRole(ExtendedGUID gosid, long revisionRole, ExtendedGUID gctxid) { - revisionRoleMap.put(gosid, Pair.of(revisionRole, gctxid)); + revisionRoleMap.put(gosid, new RoleGuid(revisionRole, gctxid)); } public List<ExtendedGUID> getRevisionListOrder() { @@ -112,12 +110,12 @@ class OneNoteDocument { return this; } - public Map<ExtendedGUID, Pair<Long, ExtendedGUID>> getRevisionRoleMap() { + public Map<ExtendedGUID, RoleGuid> getRevisionRoleMap() { return revisionRoleMap; } public OneNoteDocument setRevisionRoleMap( - Map<ExtendedGUID, Pair<Long, ExtendedGUID>> revisionRoleMap) { + Map<ExtendedGUID, RoleGuid> revisionRoleMap) { this.revisionRoleMap = revisionRoleMap; return this; } diff --git a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/main/java/org/apache/tika/parser/microsoft/onenote/OneNoteParser.java b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/main/java/org/apache/tika/parser/microsoft/onenote/OneNoteParser.java index 55913515f..658d4983e 100644 --- a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/main/java/org/apache/tika/parser/microsoft/onenote/OneNoteParser.java +++ b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/main/java/org/apache/tika/parser/microsoft/onenote/OneNoteParser.java @@ -27,7 +27,6 @@ import java.util.Map; import java.util.Set; import org.apache.commons.io.IOUtils; -import org.apache.commons.lang3.tuple.Pair; import org.xml.sax.ContentHandler; import org.xml.sax.SAXException; @@ -127,7 +126,7 @@ public class OneNoteParser implements Parser { metadata.set(ONE_NOTE_PREFIX + "rgbPlaceholder", "0x" + Long.toHexString(oneNoteDocument.header.rgbPlaceholder)); - Pair<Long, ExtendedGUID> roleAndContext = Pair.of(1L, ExtendedGUID.nil()); + RoleGuid roleAndContext = new RoleGuid(1L, ExtendedGUID.nil()); OneNoteTreeWalker oneNoteTreeWalker = new OneNoteTreeWalker(options, oneNoteDocument, oneNoteDirectFileResource, xhtml, metadata, context, roleAndContext); diff --git a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/main/java/org/apache/tika/parser/microsoft/onenote/OneNoteTreeWalker.java b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/main/java/org/apache/tika/parser/microsoft/onenote/OneNoteTreeWalker.java index 82297e11d..9d4f31043 100644 --- a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/main/java/org/apache/tika/parser/microsoft/onenote/OneNoteTreeWalker.java +++ b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/main/java/org/apache/tika/parser/microsoft/onenote/OneNoteTreeWalker.java @@ -35,7 +35,6 @@ import java.util.regex.Matcher; import java.util.regex.Pattern; import org.apache.commons.io.IOUtils; -import org.apache.commons.lang3.tuple.Pair; import org.xml.sax.SAXException; import org.xml.sax.helpers.AttributesImpl; @@ -92,7 +91,7 @@ class OneNoteTreeWalker { private final OneNoteDocument oneNoteDocument; private final OneNoteDirectFileResource dif; private final XHTMLContentHandler xhtml; - private final Pair<Long, ExtendedGUID> roleAndContext; + private final RoleGuid roleAndContext; private Instant lastModifiedTimestamp = Instant.MIN; private long creationTimestamp = Long.MAX_VALUE; private long lastModified = Long.MIN_VALUE; @@ -102,7 +101,7 @@ class OneNoteTreeWalker { /** * Contains pairs of {Offset,Length} that we have added to the text stream already. */ - private final Set<Pair<Long, Integer>> textAlreadyFetched = new HashSet<>(); + private final Set<RoleGuid> textAlreadyFetched = new HashSet<>(); /** * Create a one tree walker. @@ -119,7 +118,7 @@ class OneNoteTreeWalker { public OneNoteTreeWalker(OneNoteTreeWalkerOptions options, OneNoteDocument oneNoteDocument, OneNoteDirectFileResource dif, XHTMLContentHandler xhtml, Metadata parentMetadata, ParseContext parseContext, - Pair<Long, ExtendedGUID> roleAndContext) { + RoleGuid roleAndContext) { this.options = options; this.oneNoteDocument = oneNoteDocument; this.dif = dif; @@ -176,8 +175,8 @@ class OneNoteTreeWalker { * @param revisionRole The revision role Long,GUID pair. * @return True if exists, false if not. */ - private boolean hasRevisionRole(ExtendedGUID rid, Pair<Long, ExtendedGUID> revisionRole) { - Pair<Long, ExtendedGUID> where = oneNoteDocument.revisionRoleMap.get(rid); + private boolean hasRevisionRole(ExtendedGUID rid, RoleGuid revisionRole) { + RoleGuid where = oneNoteDocument.revisionRoleMap.get(rid); return where != null && where.equals(revisionRole); } diff --git a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/main/java/org/apache/tika/parser/microsoft/onenote/RoleGuid.java b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/main/java/org/apache/tika/parser/microsoft/onenote/RoleGuid.java new file mode 100644 index 000000000..9afae58c2 --- /dev/null +++ b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/main/java/org/apache/tika/parser/microsoft/onenote/RoleGuid.java @@ -0,0 +1,4 @@ +package org.apache.tika.parser.microsoft.onenote; + +public record RoleGuid(Long role, ExtendedGUID extendedGUID) { +} diff --git a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/main/java/org/apache/tika/parser/microsoft/onenote/fsshttpb/streamobj/chunking/ZipFilesChunking.java b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/main/java/org/apache/tika/parser/microsoft/onenote/fsshttpb/streamobj/chunking/ZipFilesChunking.java index 19eb2a06d..560cee6b6 100644 --- a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/main/java/org/apache/tika/parser/microsoft/onenote/fsshttpb/streamobj/chunking/ZipFilesChunking.java +++ b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/main/java/org/apache/tika/parser/microsoft/onenote/fsshttpb/streamobj/chunking/ZipFilesChunking.java @@ -24,7 +24,6 @@ import java.util.List; import java.util.concurrent.atomic.AtomicReference; import org.apache.commons.codec.digest.DigestUtils; -import org.apache.commons.lang3.NotImplementedException; import org.apache.tika.exception.TikaException; import org.apache.tika.parser.microsoft.onenote.fsshttpb.streamobj.LeafNodeObject; @@ -218,6 +217,6 @@ public class ZipFilesChunking extends AbstractChunking { */ private SignatureObject getSubChunkSignature() { // In current, it has no idea about how to compute the signature for sub chunk. - throw new NotImplementedException("The Get sub chunk signature method is not implemented."); + throw new IllegalStateException("The Get sub chunk signature method is not implemented."); } } diff --git a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/test/java/org/apache/tika/parser/microsoft/onenote/OneNoteParserTest.java b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/test/java/org/apache/tika/parser/microsoft/onenote/OneNoteParserTest.java index bab0a6fbc..f9c887316 100644 --- a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/test/java/org/apache/tika/parser/microsoft/onenote/OneNoteParserTest.java +++ b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/test/java/org/apache/tika/parser/microsoft/onenote/OneNoteParserTest.java @@ -25,7 +25,6 @@ import java.time.Instant; import java.util.Arrays; import java.util.List; -import org.apache.commons.lang3.StringUtils; import org.junit.jupiter.api.Test; import org.apache.tika.TikaTest; @@ -285,8 +284,7 @@ public class OneNoteParserTest extends TikaTest { public void testDupeText() throws Exception { Metadata metadata = new Metadata(); String txt = getText("test-tika-3970-dupetext.one", metadata); - - assertEquals(1, StringUtils.countMatches(txt, "Sunday morning")); + assertContainsCount("Sunday morning", txt, 1); } /** diff --git a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-miscoffice-module/pom.xml b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-miscoffice-module/pom.xml index a79c83abd..f1a254c96 100644 --- a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-miscoffice-module/pom.xml +++ b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-miscoffice-module/pom.xml @@ -45,10 +45,6 @@ <artifactId>tika-parser-xml-module</artifactId> <version>${project.version}</version> </dependency> - <dependency> - <groupId>org.apache.commons</groupId> - <artifactId>commons-lang3</artifactId> - </dependency> <dependency> <groupId>org.apache.commons</groupId> <artifactId>commons-collections4</artifactId> diff --git a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-miscoffice-module/src/main/java/org/apache/tika/parser/mif/MIFExtractor.java b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-miscoffice-module/src/main/java/org/apache/tika/parser/mif/MIFExtractor.java index 457895277..b7d40c511 100644 --- a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-miscoffice-module/src/main/java/org/apache/tika/parser/mif/MIFExtractor.java +++ b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-miscoffice-module/src/main/java/org/apache/tika/parser/mif/MIFExtractor.java @@ -24,15 +24,13 @@ import java.util.Stack; import java.util.regex.Matcher; import java.util.regex.Pattern; -import org.apache.commons.lang3.StringEscapeUtils; -import org.apache.commons.lang3.StringUtils; -import org.apache.commons.lang3.Strings; import org.xml.sax.Attributes; import org.xml.sax.ContentHandler; import org.xml.sax.SAXException; import org.xml.sax.helpers.AttributesImpl; import org.apache.tika.detect.AutoDetectReader; +import org.apache.tika.utils.StringUtils; /** * Helper Class to Parse and Extract Adobe MIF Files. @@ -124,8 +122,7 @@ public class MIFExtractor { handler.startElement(StringUtils.EMPTY, tag.getName(), tag.getName(), attrs); String value = Strings.CS.removeStart(tag.getValue(), START_TAG_VALUE); value = Strings.CS.removeEnd(value, END_TAG_VALUE); - String content = StringEscapeUtils.escapeXml(value); - handler.characters(content.toCharArray(), 0, content.length()); + handler.characters(content.toCharArray(), 0, value.length()); handler.endElement(StringUtils.EMPTY, tag.getName(), tag.getName()); } diff --git a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-ocr-module/pom.xml b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-ocr-module/pom.xml index 709d3a8ac..7af98521b 100644 --- a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-ocr-module/pom.xml +++ b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-ocr-module/pom.xml @@ -30,10 +30,6 @@ <name>Apache Tika OCR parser module</name> <dependencies> - <dependency> - <groupId>org.apache.commons</groupId> - <artifactId>commons-lang3</artifactId> - </dependency> <dependency> <groupId>org.apache.commons</groupId> <artifactId>commons-exec</artifactId> diff --git a/tika-server/tika-server-core/pom.xml b/tika-server/tika-server-core/pom.xml index 8dc3c5a82..18a68a6d9 100644 --- a/tika-server/tika-server-core/pom.xml +++ b/tika-server/tika-server-core/pom.xml @@ -95,10 +95,6 @@ <groupId>org.apache.cxf</groupId> <artifactId>cxf-rt-rs-client</artifactId> </dependency> - <dependency> - <groupId>org.apache.commons</groupId> - <artifactId>commons-lang3</artifactId> - </dependency> <!-- logging --> <dependency> <groupId>org.slf4j</groupId> diff --git a/tika-server/tika-server-standard/pom.xml b/tika-server/tika-server-standard/pom.xml index 0edd7d1aa..4d877dfe8 100644 --- a/tika-server/tika-server-standard/pom.xml +++ b/tika-server/tika-server-standard/pom.xml @@ -168,7 +168,6 @@ <exclude>org.slf4j:slf4j-api:jar:</exclude> <exclude>commons-logging:commons-logging:jar:</exclude> <exclude>org.apache.cxf:cxf-rt-rs-client:jar:</exclude> - <exclude>org.apache.commons:commons-lang3:jar:</exclude> <exclude>commons-cli:commons-cli:jar:</exclude> <exclude>org.apache.cxf:cxf-rt-rs-security-cors:jar:</exclude> <exclude>org.eclipse.jetty:jetty-io:jar:</exclude>
