This is an automated email from the ASF dual-hosted git repository.

tallison pushed a commit to branch main
in repository https://gitbox.apache.org/repos/asf/tika.git

commit 6d58ea584cf327cc28af1e3fc338094cc70416d2
Author: tallison <[email protected]>
AuthorDate: Fri Jul 24 16:59:04 2020 -0400

    TIKA-3145 -- add TextSha256Signature
---
 .../tika/eval/textstats/BytesRefCalculator.java    | 33 ++++++++++
 .../textstats/CompositeTextStatsCalculator.java    | 74 ++++++++++++++++++----
 .../tika/eval/textstats/TextProfileSignature.java  |  8 ++-
 .../tika/eval/textstats/TextSha256Signature.java   | 54 ++++++++++++++++
 .../apache/tika/eval/textstats/TextStatsTest.java  | 13 +++-
 5 files changed, 165 insertions(+), 17 deletions(-)

diff --git 
a/tika-eval/src/main/java/org/apache/tika/eval/textstats/BytesRefCalculator.java
 
b/tika-eval/src/main/java/org/apache/tika/eval/textstats/BytesRefCalculator.java
new file mode 100644
index 0000000..048b798
--- /dev/null
+++ 
b/tika-eval/src/main/java/org/apache/tika/eval/textstats/BytesRefCalculator.java
@@ -0,0 +1,33 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.tika.eval.textstats;
+
+/**
+ * Interface for calculators that consume the bytes of each analyzed token
+ * @param <T> type of the result returned by {@link BytesRefCalcInstance#finish()}
+ */
+public interface BytesRefCalculator<T> extends TextStatsCalculator {
+
+    public BytesRefCalcInstance<T> getInstance(); // the composite calculator requests one instance per tokenize pass
+
+    interface BytesRefCalcInstance<T> { // accumulates state for a single pass over the tokens
+        void update(byte[] bytes, int start, int len); // consume len bytes of the array, beginning at index start
+        T finish(); // called once after the last update(); returns the computed result
+        Class getOuterClass(); // the composite calculator uses this class as the key for the finish() result
+    }
+
+}
diff --git 
a/tika-eval/src/main/java/org/apache/tika/eval/textstats/CompositeTextStatsCalculator.java
 
b/tika-eval/src/main/java/org/apache/tika/eval/textstats/CompositeTextStatsCalculator.java
index 2c7c673..a16c767 100644
--- 
a/tika-eval/src/main/java/org/apache/tika/eval/textstats/CompositeTextStatsCalculator.java
+++ 
b/tika-eval/src/main/java/org/apache/tika/eval/textstats/CompositeTextStatsCalculator.java
@@ -17,29 +17,38 @@
 package org.apache.tika.eval.textstats;
 
 import java.io.IOException;
+import java.security.MessageDigest;
 import java.util.ArrayList;
 import java.util.HashMap;
 import java.util.List;
 import java.util.Map;
 
+import org.apache.commons.codec.binary.Base64;
+import org.apache.commons.codec.digest.DigestUtils;
 import org.apache.lucene.analysis.Analyzer;
 import org.apache.lucene.analysis.TokenStream;
+import org.apache.lucene.analysis.tokenattributes.BytesTermAttribute;
 import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
+import org.apache.lucene.analysis.tokenattributes.TermToBytesRefAttribute;
+import org.apache.lucene.util.BytesRef;
 import org.apache.tika.eval.langid.Language;
 import org.apache.tika.eval.langid.LanguageIDWrapper;
 import org.apache.tika.eval.tokens.AnalyzerManager;
 import org.apache.tika.eval.tokens.TokenCounts;
+import org.apache.tika.metadata.Message;
 
 
 public class CompositeTextStatsCalculator {
 
     private static final String FIELD = "f";
     private static final int DEFAULT_MAX_TOKENS = 10_000_000;
+    private final byte[] whitespace = new byte[]{' '};
     private final Analyzer analyzer;
     private final LanguageIDWrapper languageIDWrapper;
     private final List<LanguageAwareTokenCountStats> 
languageAwareTokenCountStats = new ArrayList<>();
     private final List<TokenCountStatsCalculator> tokenCountStatCalculators = 
new ArrayList<>();
     private final List<StringStatsCalculator> stringStatCalculators = new 
ArrayList<>();
+    private final List<BytesRefCalculator> bytesRefCalculators = new 
ArrayList<>();
 
     public CompositeTextStatsCalculator(List<TextStatsCalculator> calculators) 
{
         this(calculators,
@@ -68,6 +77,14 @@ public class CompositeTextStatsCalculator {
                                     "a TokenCountStats: "+t.getClass()
                     );
                 }
+            } else if (t instanceof BytesRefCalculator) {
+                bytesRefCalculators.add((BytesRefCalculator)t);
+                if (analyzer == null) {
+                    throw new IllegalArgumentException(
+                            "Analyzer must not be null if you are using "+
+                                    "a BytesRefCalculator: "+t.getClass()
+                    );
+                }
             } else {
                 throw new IllegalArgumentException(
                         "I regret I don't yet handle: "+t.getClass()
@@ -83,9 +100,11 @@ public class CompositeTextStatsCalculator {
         }
 
         TokenCounts tokenCounts = null;
-        if (tokenCountStatCalculators.size() > 0 || 
languageAwareTokenCountStats.size() > 0) {
+        if (tokenCountStatCalculators.size() > 0
+                || languageAwareTokenCountStats.size() > 0
+                || bytesRefCalculators.size() > 0) {
             try {
-                tokenCounts = tokenize(txt);
+                tokenCounts = tokenize(txt, results);
             } catch (IOException e) {
                 throw new RuntimeException(e);
             }
@@ -106,20 +125,51 @@ public class CompositeTextStatsCalculator {
         return results;
     }
 
-    private TokenCounts tokenize(String txt) throws IOException  {
+    private TokenCounts tokenize(String txt, Map<Class, Object> results) 
throws IOException  { // returns token counts; BytesRefCalculator results are put into results
         TokenCounts counts = new TokenCounts();
         TokenStream ts = analyzer.tokenStream(FIELD, txt);
-        try {
-            CharTermAttribute termAtt = 
ts.getAttribute(CharTermAttribute.class);
-            ts.reset();
-            while (ts.incrementToken()) {
-                String token = termAtt.toString();
-                counts.increment(token);
+        if (bytesRefCalculators.size() == 0) { // no byte-level calculators registered: plain token counting
+            try {
+                CharTermAttribute termAtt = 
ts.getAttribute(CharTermAttribute.class);
+                ts.reset();
+                while (ts.incrementToken()) {
+                    String token = termAtt.toString();
+                    counts.increment(token);
+                }
+                ts.end(); // end() must come before close() in the TokenStream consumer workflow
+            } finally {
+                ts.close(); // always release the stream, even if tokenizing failed
+            }
+        } else {
+            List<BytesRefCalculator.BytesRefCalcInstance> brcis = new 
ArrayList<>();
+            for (BytesRefCalculator brf : bytesRefCalculators) {
+                brcis.add(brf.getInstance()); // a fresh instance for every call
+            }
+            try {
+                TermToBytesRefAttribute termAtt = 
ts.getAttribute(TermToBytesRefAttribute.class);
+                ts.reset();
+                int i = 0; // number of tokens already fed to the instances
+                while (ts.incrementToken()) {
+                    final BytesRef bytesRef = termAtt.getBytesRef();
+                    String token = termAtt.toString(); // NOTE(review): assumes the attribute impl's toString() yields the term text -- confirm
+                    counts.increment(token);
+                    for (BytesRefCalculator.BytesRefCalcInstance brci : brcis) 
{
+                        if (i > 0) { // one space byte between consecutive tokens, none before the first
+                            brci.update(whitespace, 0, 1);
+                        }
+                        brci.update(bytesRef.bytes, bytesRef.offset, 
bytesRef.length);
+                    }
+                    i++;
+                }
+                ts.end(); // end() must come before close() in the TokenStream consumer workflow
+                for (BytesRefCalculator.BytesRefCalcInstance brc : brcis) {
+                    results.put(brc.getOuterClass(), brc.finish()); // keyed by the calculator's class
+                }
+            } finally {
+                ts.close(); // always release the stream, even if tokenizing failed
             }
-        } finally {
-            ts.close();
-            ts.end();
         }
+
         return counts;
     }
 }
diff --git 
a/tika-eval/src/main/java/org/apache/tika/eval/textstats/TextProfileSignature.java
 
b/tika-eval/src/main/java/org/apache/tika/eval/textstats/TextProfileSignature.java
index b72cc99..bc65351 100644
--- 
a/tika-eval/src/main/java/org/apache/tika/eval/textstats/TextProfileSignature.java
+++ 
b/tika-eval/src/main/java/org/apache/tika/eval/textstats/TextProfileSignature.java
@@ -16,7 +16,7 @@
  */
 package org.apache.tika.eval.textstats;
 
-import org.apache.commons.codec.binary.Base64;
+import org.apache.commons.codec.binary.Base32;
 import org.apache.commons.codec.digest.DigestUtils;
 import org.apache.commons.lang3.mutable.MutableInt;
 import org.apache.tika.eval.tokens.TokenCounts;
@@ -32,6 +32,8 @@ import java.util.Map;
  * 
https://github.com/apache/nutch/blob/master/src/java/org/apache/nutch/crawl/TextProfileSignature.java
  *
  * See documentation: 
https://nutch.apache.org/apidocs/apidocs-2.0/org/apache/nutch/crawl/TextProfileSignature.html
+ *
+ * This returns the base32-encoded SHA-256 digest of the profile string it builds
  */
 public class TextProfileSignature implements TokenCountStatsCalculator<String> 
{
 
@@ -39,7 +41,7 @@ public class TextProfileSignature implements 
TokenCountStatsCalculator<String> {
     float quantRate = 0.01f;
     boolean secondaryLexicographicSorting = true;
 
-    Base64 base64 = new Base64();
+    Base32 base32 = new Base32();
 
     @Override
     public String calculate(TokenCounts tokenCounts) {
@@ -74,7 +76,7 @@ public class TextProfileSignature implements 
TokenCountStatsCalculator<String> {
             }
             newText.append(t.val);
         }
-        return base64.encodeAsString(DigestUtils.sha256(newText.toString()));
+        return base32.encodeAsString(DigestUtils.sha256(newText.toString()));
     }
 
     public void setMinTokenLength(int minTokenLength) {
diff --git 
a/tika-eval/src/main/java/org/apache/tika/eval/textstats/TextSha256Signature.java
 
b/tika-eval/src/main/java/org/apache/tika/eval/textstats/TextSha256Signature.java
new file mode 100644
index 0000000..eab4e08
--- /dev/null
+++ 
b/tika-eval/src/main/java/org/apache/tika/eval/textstats/TextSha256Signature.java
@@ -0,0 +1,54 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.tika.eval.textstats;
+
+import org.apache.commons.codec.binary.Base32;
+import org.apache.commons.codec.digest.DigestUtils;
+
+import java.security.MessageDigest;
+
+/**
+ * Calculates the base32 encoded SHA-256 checksum on the analyzed text
+ */
+public class TextSha256Signature implements BytesRefCalculator<String> {
+
+    static final Base32 BASE32 = new Base32(); // shared encoder; final so the reference cannot be reassigned
+
+    @Override
+    public BytesRefCalcInstance<String> getInstance() {
+        return new TextSha256Instance(); // each instance owns its own digest state
+    }
+
+    static class TextSha256Instance implements BytesRefCalcInstance<String> { // static: uses nothing from an enclosing instance
+        private final MessageDigest messageDigest = DigestUtils.getSha256Digest();
+        @Override
+        public void update(byte[] bytes, int start, int len) {
+            messageDigest.update(bytes, start, len); // fold len bytes, beginning at start, into the running digest
+
+        }
+
+        @Override
+        public String finish() {
+            return BASE32.encodeAsString(messageDigest.digest()); // base32 text of the completed digest
+        }
+
+        @Override
+        public Class getOuterClass() {
+            return TextSha256Signature.class; // the class literal the caller keys this result by
+        }
+    }
+}
diff --git 
a/tika-eval/src/test/java/org/apache/tika/eval/textstats/TextStatsTest.java 
b/tika-eval/src/test/java/org/apache/tika/eval/textstats/TextStatsTest.java
index ceb39ad..8206977 100644
--- a/tika-eval/src/test/java/org/apache/tika/eval/textstats/TextStatsTest.java
+++ b/tika-eval/src/test/java/org/apache/tika/eval/textstats/TextStatsTest.java
@@ -16,11 +16,14 @@
  */
 package org.apache.tika.eval.textstats;
 
+import org.apache.commons.codec.binary.Base32;
+import org.apache.commons.codec.digest.DigestUtils;
 import org.apache.tika.eval.langid.Language;
 import org.apache.tika.eval.langid.LanguageIDWrapper;
 import org.apache.tika.eval.tokens.CommonTokenResult;
 import org.junit.Test;
 
+import java.nio.charset.StandardCharsets;
 import java.util.ArrayList;
 import java.util.List;
 import java.util.Map;
@@ -31,12 +34,14 @@ public class TextStatsTest {
 
     @Test
     public void testBasic() throws Exception {
-        String txt = "the quick brown fox &&^&%@! 8675309 jumped over the lazy 
wombat";
+        String txt = "The quick brown fox &&^&%@! 8675309 jumped over tHe lazy 
wombat";
+        String txtCleaned = "the quick brown fox 8675309 jumped over the lazy 
wombat";
         List<TextStatsCalculator> calcs = new ArrayList<>();
         calcs.add(new TextProfileSignature());
         calcs.add(new ContentLengthCalculator());
         calcs.add(new TokenEntropy());
         calcs.add(new CommonTokens());
+        calcs.add(new TextSha256Signature());
         CompositeTextStatsCalculator calc = new 
CompositeTextStatsCalculator(calcs);
 
         Map<Class, Object> stats = calc.calculate(txt);
@@ -60,6 +65,10 @@ public class TextStatsTest {
         assertEquals(0.01, probabilities.get(1).getConfidence(), 0.01);
 
         String textProfileSignature = 
(String)stats.get(TextProfileSignature.class);
-        assertEquals("aKhbjS6iV87VBbf/12OfDCWMBg5aS3Atktl2n4ypg14=", 
textProfileSignature);
+        
assertEquals("NCUFXDJOUJL45VIFW775OY47BQSYYBQOLJFXALMS3F3J7DFJQNPA====", 
textProfileSignature);
+
+        assertEquals(new Base32().encodeAsString(
+                
DigestUtils.sha256(txtCleaned.getBytes(StandardCharsets.UTF_8))),
+                stats.get(TextSha256Signature.class));
     }
 }

Reply via email to