This is an automated email from the ASF dual-hosted git repository. tallison pushed a commit to branch main in repository https://gitbox.apache.org/repos/asf/tika.git
commit 0c69e21031cf2a862430e88fba00637d1d10010c
Author: tallison <[email protected]>
AuthorDate: Fri Jul 24 14:56:14 2020 -0400

    TIKA-3146 -- add Nutch's TextProfileSignature to tika-eval
---
 .../tika/eval/textstats/TextProfileSignature.java  | 114 +++++++++++++++++++++
 .../apache/tika/eval/textstats/TextStatsTest.java  |  65 ++++++++++++
 2 files changed, 179 insertions(+)

diff --git a/tika-eval/src/main/java/org/apache/tika/eval/textstats/TextProfileSignature.java b/tika-eval/src/main/java/org/apache/tika/eval/textstats/TextProfileSignature.java
new file mode 100644
index 0000000..b72cc99
--- /dev/null
+++ b/tika-eval/src/main/java/org/apache/tika/eval/textstats/TextProfileSignature.java
@@ -0,0 +1,114 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.tika.eval.textstats;
+
+import org.apache.commons.codec.binary.Base64;
+import org.apache.commons.codec.digest.DigestUtils;
+import org.apache.commons.lang3.mutable.MutableInt;
+import org.apache.tika.eval.tokens.TokenCounts;
+
+import java.util.ArrayList;
+import java.util.Collections;
+import java.util.Comparator;
+import java.util.List;
+import java.util.Map;
+
+/**
+ * Copied nearly directly from Apache Nutch:
+ * https://github.com/apache/nutch/blob/master/src/java/org/apache/nutch/crawl/TextProfileSignature.java
+ *
+ * See documentation: https://nutch.apache.org/apidocs/apidocs-2.0/org/apache/nutch/crawl/TextProfileSignature.html
+ */
+public class TextProfileSignature implements TokenCountStatsCalculator<String> {
+
+    int minTokenLength = 2;
+    float quantRate = 0.01f;
+    boolean secondaryLexicographicSorting = true;
+
+    Base64 base64 = new Base64();
+
+    @Override
+    public String calculate(TokenCounts tokenCounts) {
+        int maxFreq = -1;
+        for (Map.Entry<String, MutableInt> e : tokenCounts.getTokens().entrySet()){
+            if (e.getKey().length() > minTokenLength) {
+                if (e.getValue().intValue() > maxFreq) {
+                    maxFreq = e.getValue().intValue();
+                }
+            }
+        }
+
+        int quant = Math.round(maxFreq * quantRate);
+        if (quant < 2) {
+            if (maxFreq > 1) {
+                quant = 2;
+            } else {
+                quant = 1;
+            }
+        }
+
+        List<Token> profile = new ArrayList<>();
+        for (Map.Entry<String, MutableInt> e : tokenCounts.getTokens().entrySet()){
+            profile.add(new Token((e.getValue().intValue()/quant)*quant, e.getKey()));
+        }
+        Collections.sort(profile, new TokenComparator());
+        StringBuffer newText = new StringBuffer();
+        int i = 0;
+        for (Token t : profile) {
+            if (i++ > 0) {
+                newText.append("\n");
+            }
+            newText.append(t.val);
+        }
+        return base64.encodeAsString(DigestUtils.sha256(newText.toString()));
+    }
+
+    public void setMinTokenLength(int minTokenLength) {
+        this.minTokenLength = minTokenLength;
+    }
+
+    public void setQuantRate(float quantRate) {
+        this.quantRate = quantRate;
+    }
+    private static class Token {
+        public int cnt;
+        public String val;
+
+        public Token(int cnt, String val) {
+            this.cnt = cnt;
+            this.val = val;
+        }
+
+        public String toString() {
+            return val + " " + cnt;
+        }
+    }
+
+    private class TokenComparator implements Comparator<Token> {
+        /**
+         * Sort tokens first by decreasing frequency and second in lexicographic
+         * (Unicode) order
+         */
+        public int compare(Token t1, Token t2) {
+            int diffCnt = t2.cnt - t1.cnt;
+            if (diffCnt == 0 && secondaryLexicographicSorting) {
+                return t1.val.compareTo(t2.val);
+            }
+            return diffCnt;
+        }
+    }
+}
diff --git a/tika-eval/src/test/java/org/apache/tika/eval/textstats/TextStatsTest.java b/tika-eval/src/test/java/org/apache/tika/eval/textstats/TextStatsTest.java
new file mode 100644
index 0000000..ceb39ad
--- /dev/null
+++ b/tika-eval/src/test/java/org/apache/tika/eval/textstats/TextStatsTest.java
@@ -0,0 +1,65 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.tika.eval.textstats;
+
+import org.apache.tika.eval.langid.Language;
+import org.apache.tika.eval.langid.LanguageIDWrapper;
+import org.apache.tika.eval.tokens.CommonTokenResult;
+import org.junit.Test;
+
+import java.util.ArrayList;
+import java.util.List;
+import java.util.Map;
+
+import static org.junit.Assert.assertEquals;
+
+public class TextStatsTest {
+
+    @Test
+    public void testBasic() throws Exception {
+        String txt = "the quick brown fox &&^&%@! 8675309 jumped over the lazy wombat";
+        List<TextStatsCalculator> calcs = new ArrayList<>();
+        calcs.add(new TextProfileSignature());
+        calcs.add(new ContentLengthCalculator());
+        calcs.add(new TokenEntropy());
+        calcs.add(new CommonTokens());
+        CompositeTextStatsCalculator calc = new CompositeTextStatsCalculator(calcs);
+
+        Map<Class, Object> stats = calc.calculate(txt);
+
+
+        CommonTokenResult ctr = (CommonTokenResult)stats.get(CommonTokens.class);
+        assertEquals("eng", ctr.getLangCode());
+        assertEquals( 9, ctr.getAlphabeticTokens());
+        assertEquals( 8, ctr.getCommonTokens());
+        assertEquals( 7, ctr.getUniqueCommonTokens());
+        assertEquals( 8, ctr.getUniqueAlphabeticTokens());
+        assertEquals( 0.11, ctr.getOOV(), 0.02);
+
+
+        assertEquals(63, (int)stats.get(ContentLengthCalculator.class));
+
+        assertEquals(3.12, (double)stats.get(TokenEntropy.class), 0.01);
+
+        List<Language> probabilities = (List<Language>) stats.get(LanguageIDWrapper.class);
+        assertEquals("eng", probabilities.get(0).getLanguage());
+        assertEquals(0.01, probabilities.get(1).getConfidence(), 0.01);
+
+        String textProfileSignature = (String)stats.get(TextProfileSignature.class);
+        assertEquals("aKhbjS6iV87VBbf/12OfDCWMBg5aS3Atktl2n4ypg14=", textProfileSignature);
+    }
+}
