http://git-wip-us.apache.org/repos/asf/tika/blob/aa7a0c35/tika-eval/src/main/java/org/apache/tika/eval/tokens/TokenCounter.java ---------------------------------------------------------------------- diff --git a/tika-eval/src/main/java/org/apache/tika/eval/tokens/TokenCounter.java b/tika-eval/src/main/java/org/apache/tika/eval/tokens/TokenCounter.java new file mode 100644 index 0000000..28e1c78 --- /dev/null +++ b/tika-eval/src/main/java/org/apache/tika/eval/tokens/TokenCounter.java @@ -0,0 +1,167 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ +package org.apache.tika.eval.tokens; + +import java.io.IOException; +import java.util.Collections; +import java.util.HashMap; +import java.util.Map; + +import org.apache.commons.lang3.mutable.MutableInt; +import org.apache.commons.math3.stat.descriptive.SummaryStatistics; +import org.apache.commons.math3.util.FastMath; +import org.apache.lucene.analysis.Analyzer; +import org.apache.lucene.analysis.TokenStream; +import org.apache.lucene.analysis.tokenattributes.CharTermAttribute; + +public class TokenCounter { + + private static final String ALPHA_IDEOGRAPH_SUFFIX = "_a"; + + + Map<String, Map<String, MutableInt>> map = new HashMap<>(); //Map<field, Map<token, count>> + Map<String, TokenStatistics> tokenStatistics = new HashMap<>(); + + private final TokenStatistics NULL_TOKEN_STAT = new TokenStatistics( + 0, 0, new TokenIntPair[0], 0.0d, new SummaryStatistics()); + + private final Analyzer generalAnalyzer; + private final Analyzer alphaIdeoAnalyzer; + + private int topN = 10; + + public TokenCounter(Analyzer generalAnalyzer, Analyzer alphaIdeoAnalyzer) throws IOException { + this.generalAnalyzer = generalAnalyzer; + this.alphaIdeoAnalyzer = alphaIdeoAnalyzer; + } + + public void add(String field, String content) throws IOException { + _add(field, generalAnalyzer, content); + _add(field+ALPHA_IDEOGRAPH_SUFFIX, alphaIdeoAnalyzer, content); + } + + private void _add(String field, Analyzer analyzer, String content) throws IOException { + int totalTokens = 0; + + TokenStream ts = analyzer.tokenStream(field, content); + CharTermAttribute termAtt = ts.getAttribute(CharTermAttribute.class); + ts.reset(); + Map<String, MutableInt> tokenMap = map.get(field); + if (tokenMap == null) { + tokenMap = new HashMap<>(); + map.put(field, tokenMap); + } + while (ts.incrementToken()) { + String token = termAtt.toString(); + MutableInt cnt = tokenMap.get(token); + if (cnt == null) { + cnt = new MutableInt(1); + tokenMap.put(token, cnt); + } else { + cnt.increment(); + } + 
totalTokens++; + } + ts.close(); + ts.end(); + + int totalUniqueTokens = tokenMap.size(); + + double ent = 0.0d; + double p = 0.0d; + double base = 2.0; + + TokenCountPriorityQueue queue = new TokenCountPriorityQueue(topN); + + SummaryStatistics summaryStatistics = new SummaryStatistics(); + for (Map.Entry<String, MutableInt> e : tokenMap.entrySet()) { + String token = e.getKey(); + int termFreq = e.getValue().intValue(); + + p = (double) termFreq / (double) totalTokens; + ent += p * FastMath.log(base, p); + int len = token.codePointCount(0, token.length()); + for (int i = 0; i < e.getValue().intValue(); i++) { + summaryStatistics.addValue(len); + } + if (queue.top() == null || queue.size() < topN || + termFreq >= queue.top().getValue()) { + queue.insertWithOverflow(new TokenIntPair(token, termFreq)); + } + + } + if (totalTokens > 0) { + ent = (-1.0d / (double)totalTokens) * ent; + } + +/* Collections.sort(allTokens); + List<TokenIntPair> topNList = new ArrayList<>(topN); + for (int i = 0; i < topN && i < allTokens.size(); i++) { + topNList.add(allTokens.get(i)); + }*/ + + tokenStatistics.put(field, new TokenStatistics(totalUniqueTokens, totalTokens, + queue.getArray(), ent, summaryStatistics)); + + } + + public TokenStatistics getTokenStatistics(String field) { + TokenStatistics tokenStat = tokenStatistics.get(field); + if (tokenStat == null) { + return NULL_TOKEN_STAT; + } + return tokenStat; + } + + public void setTopN(int topN) { + this.topN = topN; + } + + public void clear(String field) { + Map<String, MutableInt> tokenMap = map.get(field); + if (tokenMap != null) { + tokenMap.clear(); + } + Map<String, MutableInt> alphaMap = map.get(field+ALPHA_IDEOGRAPH_SUFFIX); + if (alphaMap != null) { + alphaMap.clear(); + } + + tokenStatistics.put(field+ALPHA_IDEOGRAPH_SUFFIX, NULL_TOKEN_STAT); + tokenStatistics.put(field, NULL_TOKEN_STAT); + } + + public Map<String, MutableInt> getAlphaTokens(String field) { + Map<String, MutableInt> ret = 
map.get(field+ALPHA_IDEOGRAPH_SUFFIX); + if (ret == null) { + return Collections.emptyMap(); + } + return ret; + } + + public Map<String, MutableInt> getTokens(String field) { + Map<String, MutableInt> ret = map.get(field); + if (ret == null) { + return Collections.emptyMap(); + } + return ret; + } + + public TokenStatistics getAlphaTokenStatistics(String fieldName) { + return getTokenStatistics(fieldName+ALPHA_IDEOGRAPH_SUFFIX); + } +}
http://git-wip-us.apache.org/repos/asf/tika/blob/aa7a0c35/tika-eval/src/main/java/org/apache/tika/eval/tokens/TokenIntPair.java ---------------------------------------------------------------------- diff --git a/tika-eval/src/main/java/org/apache/tika/eval/tokens/TokenIntPair.java b/tika-eval/src/main/java/org/apache/tika/eval/tokens/TokenIntPair.java new file mode 100644 index 0000000..4b57d25 --- /dev/null +++ b/tika-eval/src/main/java/org/apache/tika/eval/tokens/TokenIntPair.java @@ -0,0 +1,82 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
/**
 * Immutable (token, count) pair.
 * <p>
 * Sort order ({@link #compareTo(TokenIntPair)}) is descending by value,
 * then ascending by token, so "most frequent first" sorts naturally.
 * <p>
 * Review note: the {@code org.jetbrains.annotations.NotNull} annotation was
 * removed -- it added an IDE-only third-party dependency and has no effect
 * on behavior.
 */
public class TokenIntPair implements Comparable<TokenIntPair> {

    final String token;
    final int value;

    public TokenIntPair(String token, int value) {
        this.token = token;
        this.value = value;
    }

    /**
     * @return the count; widened to {@code long} for historical/caller
     * compatibility even though it is stored as an {@code int}
     */
    public long getValue() {
        return value;
    }

    public String getToken() {
        return token;
    }

    @Override
    public boolean equals(Object o) {
        if (this == o) return true;
        if (o == null || getClass() != o.getClass()) return false;

        TokenIntPair that = (TokenIntPair) o;

        if (value != that.value) return false;
        return token.equals(that.token);
    }

    @Override
    public int hashCode() {
        int result = token.hashCode();
        result = 31 * result + value;
        return result;
    }

    /**
     * Descending by value, ascending by token.
     *
     * @param o other token/count pair
     * @return comparison result
     */
    @Override
    public int compareTo(TokenIntPair o) {
        if (this.value > o.value) {
            return -1;
        } else if (this.value < o.value) {
            return 1;
        }
        return this.token.compareTo(o.token);
    }

    @Override
    public String toString() {
        return "TokenIntPair{" +
                "token='" + token + '\'' +
                ", value=" + value +
                '}';
    }
}
+ * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.apache.tika.eval.tokens; + +import java.util.Arrays; + +import org.apache.commons.math3.stat.descriptive.SummaryStatistics; + + +public class TokenStatistics { + + private final int totalTokens; + private final int totalUniqueTokens; + private final TokenIntPair[] topN; + private final double entropy; + private final SummaryStatistics summaryStatistics; + + public TokenStatistics(int totalUniqueTokens, int totalTokens, + TokenIntPair[] topN, + double entropy, SummaryStatistics summaryStatistics) { + this.totalUniqueTokens = totalUniqueTokens; + this.totalTokens = totalTokens; + this.topN = topN; + this.entropy = entropy; + this.summaryStatistics = summaryStatistics; + } + + + public int getTotalTokens() { + + return totalTokens; + } + + public int getTotalUniqueTokens() { + return totalUniqueTokens; + } + + public TokenIntPair[] getTopN() { + return topN; + } + + public double getEntropy() { + return entropy; + } + + public SummaryStatistics getSummaryStatistics() { + return summaryStatistics; + } + + + @Override + public String toString() { + return "TokenStatistics{" + + "totalTokens=" + totalTokens + + ", totalUniqueTokens=" + totalUniqueTokens + + ", topN=" + Arrays.toString(topN) + + ", entropy=" + entropy + + ", summaryStatistics=" + summaryStatistics + + '}'; + } + + @Override + public boolean equals(Object o) { + + if (this == o) return true; + if (o == null || 
getClass() != o.getClass()) return false; + + TokenStatistics that = (TokenStatistics) o; + + if (totalTokens != that.totalTokens) return false; + if (totalUniqueTokens != that.totalUniqueTokens) return false; + if (!doubleEquals(that.entropy, entropy)) return false; + // Probably incorrect - comparing Object[] arrays with Arrays.equals + if (!Arrays.equals(topN, that.topN)) return false; + + SummaryStatistics thatS = ((TokenStatistics) o).summaryStatistics; + if (summaryStatistics.getN() != thatS.getN()) return false; + + //if both have n==0, don't bother with the stats + if (summaryStatistics.getN() ==0L) return true; + //TODO: consider adding others... + if (!doubleEquals(summaryStatistics.getGeometricMean(), thatS.getGeometricMean())) return false; + if (!doubleEquals(summaryStatistics.getMax(), thatS.getMax())) return false; + if (!doubleEquals(summaryStatistics.getMean(), thatS.getMean())) return false; + if (!doubleEquals(summaryStatistics.getMin(), thatS.getMin())) return false; + if (!doubleEquals(summaryStatistics.getSum(), thatS.getSum())) return false; + if (!doubleEquals(summaryStatistics.getStandardDeviation(), thatS.getStandardDeviation())) return false; + return true; + } + + @Override + public int hashCode() { + int result; + long temp; + result = (int) (totalTokens ^ (totalTokens >>> 32)); + result = 31 * result + (int) (totalUniqueTokens ^ (totalUniqueTokens >>> 32)); + result = 31 * result + Arrays.hashCode(topN); + temp = Double.doubleToLongBits(entropy); + result = 31 * result + (int) (temp ^ (temp >>> 32)); + result = 31 * result + summaryStatistics.hashCode(); + return result; + } + + private static boolean doubleEquals(double a, double b) { + return doubleEquals(a, b, 0.000000000001d); + } + + private static boolean doubleEquals(double a, double b, double epsilon) { + return a == b ? 
true : Math.abs(a - b) < epsilon; + } + + +} http://git-wip-us.apache.org/repos/asf/tika/blob/aa7a0c35/tika-eval/src/main/java/org/apache/tika/eval/util/LanguageIDWrapper.java ---------------------------------------------------------------------- diff --git a/tika-eval/src/main/java/org/apache/tika/eval/util/LanguageIDWrapper.java b/tika-eval/src/main/java/org/apache/tika/eval/util/LanguageIDWrapper.java new file mode 100644 index 0000000..59d032a --- /dev/null +++ b/tika-eval/src/main/java/org/apache/tika/eval/util/LanguageIDWrapper.java @@ -0,0 +1,69 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ +package org.apache.tika.eval.util; + +import java.io.IOException; +import java.nio.file.Path; +import java.util.List; + +import com.google.common.base.Optional; +import com.optimaize.langdetect.DetectedLanguage; +import com.optimaize.langdetect.LanguageDetector; +import com.optimaize.langdetect.LanguageDetectorBuilder; +import com.optimaize.langdetect.i18n.LdLocale; +import com.optimaize.langdetect.ngram.NgramExtractors; +import com.optimaize.langdetect.profiles.LanguageProfile; +import com.optimaize.langdetect.profiles.LanguageProfileReader; +import com.optimaize.langdetect.text.CommonTextObjectFactories; +import com.optimaize.langdetect.text.TextObjectFactory; + + +public class LanguageIDWrapper { + static List<LanguageProfile> languageProfiles; + static LanguageDetector detector; + static TextObjectFactory textObjectFactory; + + public static void loadBuiltInModels() throws IOException { + + languageProfiles = new LanguageProfileReader().readAllBuiltIn(); + detector = LanguageDetectorBuilder.create(NgramExtractors.standard()) + .withProfiles(languageProfiles) + .build(); + textObjectFactory = CommonTextObjectFactories.forDetectingOnLargeText(); + } + + public static void loadModels(Path path) throws IOException { + + languageProfiles = new LanguageProfileReader().readAll(path.toFile()); + detector = LanguageDetectorBuilder.create(NgramExtractors.standard()) + .withProfiles(languageProfiles) + .build(); + textObjectFactory = CommonTextObjectFactories.forDetectingOnLargeText(); + } + + + + public static Optional<LdLocale> detect(String s) { + return detector.detect(textObjectFactory.forText(s)); + } + + public static List<DetectedLanguage> getProbabilities(String s) { + + return detector.getProbabilities(textObjectFactory.forText(s)); + } + +} http://git-wip-us.apache.org/repos/asf/tika/blob/aa7a0c35/tika-eval/src/main/resources/META-INF/services/org.apache.lucene.analysis.util.TokenFilterFactory 
---------------------------------------------------------------------- diff --git a/tika-eval/src/main/resources/META-INF/services/org.apache.lucene.analysis.util.TokenFilterFactory b/tika-eval/src/main/resources/META-INF/services/org.apache.lucene.analysis.util.TokenFilterFactory new file mode 100644 index 0000000..1d21002 --- /dev/null +++ b/tika-eval/src/main/resources/META-INF/services/org.apache.lucene.analysis.util.TokenFilterFactory @@ -0,0 +1,17 @@ +# Licensed to the Apache Software Foundation (ASF) under one or more +# contributor license agreements. See the NOTICE file distributed with +# this work for additional information regarding copyright ownership. +# The ASF licenses this file to You under the Apache License, Version 2.0 +# (the "License"); you may not use this file except in compliance with +# the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +org.apache.tika.eval.tokens.AlphaIdeographFilterFactory +org.apache.tika.eval.tokens.CJKBigramAwareLengthFilterFactory \ No newline at end of file http://git-wip-us.apache.org/repos/asf/tika/blob/aa7a0c35/tika-eval/src/main/resources/comparison-reports.xml ---------------------------------------------------------------------- diff --git a/tika-eval/src/main/resources/comparison-reports.xml b/tika-eval/src/main/resources/comparison-reports.xml new file mode 100644 index 0000000..cb7befd --- /dev/null +++ b/tika-eval/src/main/resources/comparison-reports.xml @@ -0,0 +1,791 @@ +<?xml version="1.0" encoding="UTF-8" standalone="no" ?> + +<!-- + Licensed to the Apache Software Foundation (ASF) under one + or more contributor license agreements. See the NOTICE file + distributed with this work for additional information + regarding copyright ownership. The ASF licenses this file + to you under the Apache License, Version 2.0 (the + "License"); you may not use this file except in compliance + with the License. You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, + software distributed under the License is distributed on an + "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + KIND, either express or implied. See the License for the + specific language governing permissions and limitations + under the License. 
+--> + +<reports> + + + <before> + + <sql>drop table if exists md5_multiples_tmp_a</sql> + <sql>create table md5_multiples_tmp_a (MD5 char(32), cnt int) + as + select md5, count(1) cnt + from profiles_a + where md5 is not null + group by md5 + having cnt > 1 + order by cnt desc + </sql> + + <sql>drop table if exists md5_multiples_tmp_b</sql> + <sql>create table md5_multiples_tmp_b (MD5 char(32), cnt int) + as + select md5, count(1) cnt + from profiles_b + where md5 is not null + group by md5 + having cnt > 1 + order by cnt desc + </sql> + <sql>create index if not exists pa_m_idx + on profiles_a (mime_type_id); + </sql> + <sql> + create index if not exists pb_m_idx + on profiles_b (mime_type_id); + </sql> + + <sql>drop table if exists exceptions_compared</sql> + <sql> + create table exceptions_compared + (mime_type_id integer primary key, + exceptions_a integer, + total_a integer, + percent_exceptions_a double, + exceptions_b integer, + total_b integer, + percent_exceptions_b double) + </sql> + <sql> + insert into exceptions_compared (mime_type_id) + select mime_type_id from mimes; + </sql> + + <sql> + update exceptions_compared ec set total_a=( + select count(1) as cnt from profiles_a + where profiles_a.mime_type_id= ec.mime_type_id + group by mime_type_id + ) + </sql> + <sql> + update exceptions_compared ec set total_b=( + select count(1) as cnt from profiles_b + where profiles_b.mime_type_id= ec.mime_type_id + group by mime_type_id + ) + </sql> + <sql> + update exceptions_compared ec set exceptions_a=( select count(1) as + cnt from exceptions_a ea + join profiles_a pa on ea.id=pa.id + where pa.mime_type_id= ec.mime_type_id + and parse_exception_type_id=0 + group by mime_type_id ) + </sql> + <sql> + update exceptions_compared ec set exceptions_b=( + select count(1) as cnt from exceptions_b eb + join profiles_b pb on eb.id=pb.id + where pb.mime_type_id= ec.mime_type_id + and parse_exception_type_id=0 + group by mime_type_id ) + </sql> + + </before> + + <!-- MIMES 
--> + <report reportName="All Mimes In A" + reportFilename="mimes/all_mimes_A.xlsx" + format="xlsx" + includeSql="true"> + + <sql> + select mime_string, count(1) cnt from + profiles_a p + join mimes m on m.mime_type_id = p.mime_type_id + group by mime_string + order by cnt desc + </sql> + </report> + + <report reportName="All Mimes In B" + reportFilename="mimes/all_mimes_B.xlsx" + format="xlsx" + includeSql="true"> + + <sql> + select mime_string, count(1) cnt from + profiles_b p + join mimes m on m.mime_type_id = p.mime_type_id + group by mime_string + order by cnt desc + </sql> + </report> + <report reportName="Container Mimes In A" + reportFilename="mimes/container_mimes_A.xlsx" + format="xlsx" + includeSql="true"> + + <sql> + select mime_string, count(1) cnt from + profiles_a p + join mimes m on m.mime_type_id = p.mime_type_id + where is_embedded=false + group by mime_string + order by cnt desc + </sql> + </report> + + <report reportName="Container Mimes In B" + reportFilename="mimes/container_mimes_B.xlsx" + format="xlsx" + includeSql="true"> + + <sql> + select mime_string, count(1) cnt from + profiles_b p + join mimes m on m.mime_type_id = p.mime_type_id + where is_embedded=false + group by mime_string + order by cnt desc + </sql> + </report> + <report reportName="Embedded Mimes In A" + reportFilename="mimes/embedded_mimes_A.xlsx" + format="xlsx" + includeSql="true"> + + <sql> + select mime_string, count(1) cnt from + profiles_a p + join mimes m on m.mime_type_id = p.mime_type_id + where is_embedded=true + group by mime_string + order by cnt desc + </sql> + </report> + + <report reportName="Embedded Mimes In B" + reportFilename="mimes/embedded_mimes_B.xlsx" + format="xlsx" + includeSql="true"> + + <sql> + select mime_string, count(1) cnt from + profiles_b p + join mimes m on m.mime_type_id = p.mime_type_id + where is_embedded=true + group by mime_string + order by cnt desc + </sql> + </report> + <report reportName="Mime Differences A -> B" + 
reportFilename="mimes/mime_diffs_A_to_B.xlsx" + format="xlsx" + includeSql="true"> + + <sql> + select concat(ma.mime_string, ' -> ', mb.mime_string) as + MIME_A_TO_MIME_B, count(1) as COUNT + from profiles_a a + join profiles_b b on a.id=b.id + join mimes ma on ma.mime_type_id=a.mime_type_id + join mimes mb on mb.mime_type_id=b.mime_type_id + where a.mime_type_id <> b.mime_type_id + group by MIME_A_TO_MIME_B + order by COUNT DESC + </sql> + </report> + + <report reportName="Mime Differences A -> B Details" + reportFilename="mimes/mime_diffs_A_to_B_details.xlsx" + format="xlsx" + includeSql="true"> + + <sql> + select concat(ma.mime_string, ' -> ', mb.mime_string) as + MIME_A_TO_MIME_B, file_path, a.file_name + from profiles_a a + join profiles_b b on a.id=b.id + join mimes ma on ma.mime_type_id=a.mime_type_id + join mimes mb on mb.mime_type_id=b.mime_type_id + join containers c on a.container_id=c.container_id + where a.mime_type_id <> b.mime_type_id + order by MIME_A_TO_MIME_B + </sql> + </report> + + <report reportName="AllExceptionsByMimeA" + reportFilename="exceptions/exceptions_by_mime_A.xlsx" + format="xlsx" + includeSql="true"> + + <sql> + select mime_string, count(1) cnt from + exceptions_a e + join profiles_a p on p.id=e.id + join mimes m on m.mime_type_id = p.mime_type_id + group by mime_string + order by cnt desc + </sql> + </report> + <report reportName="AllExceptionsByMimeB" + reportFilename="exceptions/exceptions_by_mime_B.xlsx" + format="xlsx" + includeSql="true"> + + <sql> + select mime_string, count(1) cnt from + exceptions_b e + join profiles_b p on p.id=e.id + join mimes m on m.mime_type_id = p.mime_type_id + group by mime_string + order by cnt desc + </sql> + </report> + + <report reportName="ContainerExceptionsByMimeA" + reportFilename="exceptions/container_exceptions_by_mime_A.xlsx" + format="xlsx" + includeSql="true"> + + <sql> + select mime_string, count(1) cnt from + exceptions_a e + join profiles_a p on p.id=e.id + join mimes m on 
m.mime_type_id = p.mime_type_id + where is_embedded=false + and parse_exception_type_id=0 + group by mime_string + order by cnt desc + </sql> + </report> + + <report reportName="ContainerExceptionsByMimeB" + reportFilename="exceptions/container_exceptions_by_mime_B.xlsx" + format="xlsx" + includeSql="true"> + + <sql> + select mime_string, count(1) cnt from + exceptions_b e + join profiles_b p on p.id=e.id + join mimes m on m.mime_type_id = p.mime_type_id + where is_embedded=false + and parse_exception_type_id=0 + group by mime_string + order by cnt desc + </sql> + </report> + <report reportName="AllExceptionsByMimeByTypeA" + reportFilename="exceptions/exceptions_by_mime_by_typeA.xlsx" + format="xlsx" + includeSql="true"> + + <sql> + select mime_string as MIME_TYPE, + parse_exception_description as EXCEPTION_TYPE, count(1) as COUNT + from exceptions_a e + join profiles_a p on p.id=e.id + join containers c on p.container_id=c.container_id + join mimes m on m.mime_type_id=p.mime_type_id + join ref_parse_exception_types r on + r.parse_exception_type_id=e.parse_exception_type_id + group by p.mime_type_id, parse_exception_description + order by MIME_TYPE, EXCEPTION_TYPE + </sql> + </report> + + <report reportName="AllExceptionsByMimeByTypeB" + reportFilename="exceptions/exceptions_by_mime_by_typeB.xlsx" + format="xlsx" + includeSql="true"> + + <sql> + select mime_string as MIME_TYPE, + parse_exception_description as EXCEPTION_TYPE, count(1) as COUNT + from exceptions_b e + join profiles_b p on p.id=e.id + join containers c on p.container_id=c.container_id + join mimes m on m.mime_type_id=p.mime_type_id + join ref_parse_exception_types r on + r.parse_exception_type_id=e.parse_exception_type_id + group by p.mime_type_id, parse_exception_description + order by MIME_TYPE, EXCEPTION_TYPE + </sql> + </report> + + <report reportName="TextLostFromACausedByNewExceptionsInB" + reportFilename="exceptions/textLostFromACausedByNewExceptionsInB.xlsx" + format="xlsx" + 
includeSql="true"> + + <sql> + select file_path as FILE_PATH, ca.NUM_TOKENS as NUM_TOKENS_A, + cb.NUM_TOKENS as NUM_TOKENS_B, + ca.NUM_UNIQUE_TOKENS as NUM_UNIQUE_TOKENS_A, cb.NUM_UNIQUE_TOKENS + as NUM_UNIQUE_TOKENS_B, + ca.common_tokens_lang as COMMON_TOKENS_LANG_A, + ca.num_common_tokens as NUM_COMMON_TOKENS_A, + cb.common_tokens_lang as COMMON_TOKENS_LANG_B, + cb.num_common_tokens as NUM_COMMON_TOKENS_B, + ca.top_n_tokens as TOP_N_TOKENS_A, cb.top_n_tokens as TOP_N_TOKENS_B, + eb.ORIG_STACK_TRACE as ORIG_STACK_TRACE_B + from contents_a ca + join profiles_a pa on ca.id = pa.id + join containers c on pa.container_id=c.container_id + left join contents_b cb on ca.id=cb.id + left join exceptions_b eb on ca.id = eb.id + left join exceptions_a ea on ca.id = ea.id + where eb.orig_stack_trace is not null + and ea.orig_stack_trace is null + order by ca.num_common_tokens - ifnull(cb.num_common_tokens,0) desc + </sql> + </report> + + <report reportName="FixedExceptionsInBByMimeType" + reportFilename="exceptions/fixedExceptionsInBByMimeType.xlsx" + format="xlsx" + includeSql="true"> + + <sql> + select mime_string as MIME_TYPE, count(1) as COUNT + from exceptions_a ea + left join exceptions_b eb on ea.id = eb.id + join profiles_a pa on pa.id=ea.id + join profiles_b pb on pa.id=pb.id + join containers c on pa.container_id=c.container_id + join mimes m on m.mime_type_id=pa.mime_type_id + where eb.id is null + and ea.parse_exception_type_id=0 + group by mime_string + </sql> + </report> + + <report reportName="FixedExceptionsInByDetails" + reportFilename="exceptions/fixedExceptionsInBDetails.xlsx" + format="xlsx" + includeSql="true"> + <sql> + select mime_string as MIME_TYPE, + file_path, pa.file_name, pa.is_embedded + from exceptions_a ea + left join exceptions_b eb on ea.id = eb.id + join profiles_a pa on pa.id=ea.id + join profiles_b pb on pb.id=pa.id //this ensures that files were actually processed in both runs + join containers c on pa.container_id=c.container_id + join 
mimes m on m.mime_type_id=pa.mime_type_id + where eb.id is null + and ea.parse_exception_type_id=0 + order by mime_string + </sql> + </report> + <report reportName="ContentsOfFixedExceptionsInB" + reportFilename="exceptions/contentsOfFixedExceptionsInB.xlsx" + format="xlsx" + includeSql="true"> + + <sql> + select file_path, mime_string as MIME_TYPE, + CONTENT_LENGTH, + NUM_TOKENS, NUM_UNIQUE_TOKENS, + TOP_N_TOKENS, LANG_ID_1,TOKEN_LENGTH_MEAN, TOKEN_LENGTH_STD_DEV + from exceptions_a ea + left join exceptions_b eb on ea.id = eb.id + join profiles_a p on p.id=ea.id + join contents_b cb on cb.id=ea.id + join containers c on p.container_id=c.container_id + join mimes m on m.mime_type_id=p.mime_type_id + where eb.id is null + and ea.parse_exception_type_id=0 + </sql> + </report> + + <report reportName="NewExceptionsByMimeType" + reportFilename="exceptions/newExceptionsInBByMimeType.xlsx" + format="xlsx" + includeSql="true"> + + <sql> + select mime_string as MIME_TYPE_A, count(1) as COUNT + from exceptions_b eb + left join exceptions_a ea on ea.id = eb.id + join profiles_a pa on pa.id=eb.id + join profiles_b pb on pb.id=pa.id + join containers c on pa.container_id=c.container_id + join mimes m on m.mime_type_id=pa.mime_type_id + where ea.id is null + and eb.parse_exception_type_id=0 + group by mime_string + order by COUNT desc + </sql> + </report> + + <report reportName="NewExceptionsInBByMimeTypeByStackTrace" + reportFilename="exceptions/newExceptionsInBByMimeTypeByStackTrace.xlsx" + format="xlsx" + includeSql="true"> + + <sql> + select MIME_STRING as MIME_TYPE, eb.sort_stack_trace, count(1) as + COUNT + from exceptions_b eb + left join exceptions_a ea on ea.id = eb.id + join profiles_a p on p.id=eb.id + join mimes m on m.mime_type_id=p.mime_type_id + where ea.id is null + and eb.parse_exception_type_id=0 + group by MIME_TYPE, eb.sort_stack_trace + order by MIME_TYPE asc, COUNT desc + </sql> + </report> + + <report reportName="NewExceptionsInBDetails" + 
reportFilename="exceptions/newExceptionsInBDetails.xlsx" + format="xlsx" + includeSql="true"> + + <sql> + select file_path, MIME_STRING as MIME_TYPE, p.length, + eb.orig_stack_trace, eb.sort_stack_trace + from exceptions_b eb + left join exceptions_a ea on ea.id = eb.id + join profiles_a p on p.id=eb.id + join containers c on p.container_id=c.container_id + join mimes m on m.mime_type_id=p.mime_type_id + where ea.id is null + and eb.parse_exception_type_id=0 + order by MIME_TYPE asc, eb.ORIG_STACK_TRACE + </sql> + </report> + + <report reportName="StackTracesByMimeInA" + reportFilename="exceptions/stackTracesByMimeInA.xlsx" + format="xlsx" + includeSql="true"> + + <sql> + select MIME_STRING as MIME_TYPE, e.sort_stack_trace, count(1) as + COUNT + from exceptions_a e + join profiles_a p on p.id=e.id + join mimes m on m.mime_type_id=p.mime_type_id + and e.parse_exception_type_id=0 + group by MIME_TYPE, e.sort_stack_trace + order by MIME_TYPE asc, COUNT desc + </sql> + </report> + + <report reportName="AllStackTracesInA" + reportFilename="exceptions/stackTracesInA.xlsx" + format="xlsx" + includeSql="true"> + + <sql> + select file_path, c.length as FILE_LENGTH, MIME_STRING as MIME_TYPE, + orig_stack_trace, sort_stack_trace + from exceptions_a e + join profiles_a p on p.id=e.id + join containers c on p.container_id=c.container_id + join mimes m on m.mime_type_id=p.mime_type_id + and e.parse_exception_type_id=0 + order by MIME_TYPE asc, sort_stack_trace, orig_stack_trace, + FILE_LENGTH asc + </sql> + </report> + <report reportName="AllStackTracesInB" + reportFilename="exceptions/stackTracesInB.xlsx" + format="xlsx" + includeSql="true"> + + <sql> + select file_path, c.length as FILE_LENGTH, MIME_STRING as MIME_TYPE, + orig_stack_trace, sort_stack_trace + from exceptions_b e + join profiles_b p on p.id=e.id + join containers c on p.container_id=c.container_id + join mimes m on m.mime_type_id=p.mime_type_id + and e.parse_exception_type_id=0 + order by MIME_TYPE asc, 
sort_stack_trace, orig_stack_trace, + FILE_LENGTH asc + </sql> + </report> + + <report reportName="StackTracesByMimeInB" + reportFilename="exceptions/stackTracesByMimeInB.xlsx" + format="xlsx" + includeSql="true"> + + <sql> + select MIME_STRING as MIME_TYPE, e.sort_stack_trace, count(1) as + COUNT + from exceptions_b e + join profiles_b p on p.id=e.id + join mimes m on m.mime_type_id=p.mime_type_id + and e.parse_exception_type_id=0 + group by MIME_TYPE, e.sort_stack_trace + order by MIME_TYPE asc, COUNT desc + </sql> + </report> + <report reportName="extractErrorsA" + reportFilename="exceptions/extract_errors_a.xlsx" + format="xlsx" + includeSql="true"> + <sql> + select file_path, extract_error_description + from extract_errors_a e + join ref_extract_error_types t + on e.extract_error_type_id=t.extract_error_type_id + </sql> + </report> + <report reportName="extractErrorsB" + reportFilename="exceptions/extract_errors_b.xlsx" + format="xlsx" + includeSql="true"> + <sql> + select file_path, extract_error_description + from extract_errors_b e + join ref_extract_error_types t + on e.extract_error_type_id=t.extract_error_type_id + </sql> + </report> + <report reportName="parseExceptionTypesA" + reportFilename="exceptions/overall_exception_types_a.xlsx" + format="xlsx" + includeSql="true"> + <sql> + select parse_exception_description, count(1) + from exceptions_a e + join ref_parse_exception_types t on + t.parse_exception_type_id=e.parse_exception_type_id + group by e.parse_exception_type_id + </sql> + </report> + <report reportName="parseExceptionTypesB" + reportFilename="exceptions/overall_exception_types_b.xlsx" + format="xlsx" + includeSql="true"> + <sql> + select parse_exception_description, count(1) + from exceptions_b e + join ref_parse_exception_types t on + t.parse_exception_type_id=e.parse_exception_type_id + group by e.parse_exception_type_id + </sql> + </report> + + <report reportName="contentDiffsWExceptions" + 
reportFilename="content/content_diffs_with_exceptions.xlsx" + format="xlsx" + includeSql="true"> + + <sql> + select file_path, + ma.mime_string as mime_string_a, + mb.mime_string as mime_string_b, + ca.num_unique_tokens as NUM_UNIQUE_TOKENS_A, + cb.num_unique_tokens as NUM_UNIQUE_TOKENS_B, + ca.num_tokens as NUM_TOKENS_A, + cb.num_tokens as NUM_TOKENS_B, + ca.common_tokens_lang as COMMON_TOKENS_LANG_A, + ca.num_common_tokens as NUM_COMMON_TOKENS_A, + cb.common_tokens_lang as COMMON_TOKENS_LANG_B, + cb.num_common_tokens as NUM_COMMON_TOKENS_B, + ca.top_n_tokens as TOP_N_TOKENS_A, + cb.top_n_tokens as TOP_N_TOKENS_B, + ca.unicode_char_blocks as UNICODE_CHAR_BLOCKS_A, + cb.unicode_char_blocks as UNICODE_CHAR_BLOCKS_B, + top_10_unique_token_diffs_a, + top_10_unique_token_diffs_b, + top_10_more_in_a, top_10_more_in_b, dice_coefficient, overlap + from content_comparisons cc + join contents_a ca on ca.id=cc.id + left join contents_b cb on cb.id=cc.id + join profiles_a pa on pa.id = cc.id + join profiles_b pb on pb.id=cc.id + join containers c on c.container_id=pa.container_id + join mimes ma on ma.mime_type_id=pa.mime_type_id + join mimes mb on mb.mime_type_id=pb.mime_type_id + left join exceptions_a ea on ea.id=cc.id + left join exceptions_b eb on eb.id=cc.id + where (overlap < 0.95 or abs(ca.NUM_TOKENS-cb.NUM_TOKENS) >30) + and (ea.parse_exception_type_id is null or + ea.parse_exception_type_id <> 2) + and (eb.parse_exception_type_id is null or + eb.parse_exception_type_id <> 2) + order by ma.mime_string, overlap asc + limit 100000 + </sql> + </report> + <report reportName="contentDiffsIgnoreExceptions" + reportFilename="content/content_diffs_ignore_exceptions.xlsx" + format="xlsx" + includeSql="true"> + + <sql> + select file_path, + ma.mime_string as mime_string_a, + mb.mime_string as mime_string_b, + ca.NUM_UNIQUE_TOKENS as NUM_UNIQUE_TOKENS_A, + cb.NUM_UNIQUE_TOKENS as NUM_UNIQUE_TOKENS_B, + ca.NUM_TOKENS as NUM_TOKENS_A, + cb.NUM_TOKENS as NUM_TOKENS_B, + 
ca.common_tokens_lang as COMMON_TOKENS_LANG_A, + ca.num_common_tokens as NUM_COMMON_TOKENS_A, + cb.common_tokens_lang as COMMON_TOKENS_LANG_B, + cb.num_common_tokens as NUM_COMMON_TOKENS_B, + ca.top_n_tokens as TOP_N_TOKENS_A, + cb.top_n_tokens as TOP_N_TOKENS_B, + ca.unicode_char_blocks as UNICODE_CHAR_BLOCKS_A, + cb.unicode_char_blocks as UNICODE_CHAR_BLOCKS_B, + top_10_unique_token_diffs_a, + top_10_unique_token_diffs_b, + top_10_more_in_a, top_10_more_in_b, dice_coefficient, overlap + from content_comparisons cc + join contents_a ca on ca.id=cc.id + join contents_b cb on cb.id=cc.id + join profiles_a pa on pa.id = cc.id + join profiles_b pb on pb.id=cc.id + join containers c on c.container_id=pa.container_id + join mimes ma on ma.mime_type_id=pa.mime_type_id + join mimes mb on mb.mime_type_id=pb.mime_type_id + left join exceptions_a ea on ea.id=cc.id + left join exceptions_b eb on eb.id=cc.id + where (overlap < 0.95 or abs(ca.NUM_TOKENS-cb.NUM_TOKENS) >30) + and (ea.parse_exception_type_id is null) + and (eb.parse_exception_type_id is null) + order by ma.mime_string, overlap asc + limit 100000 + </sql> + </report> + <!-- <report reportName="MD5 Duplicate Counts A" + reportFilename="md5/md5_duplicate_counts_A.xlsx" + format="xlsx" + includeSql="true"> + <sql> + select md5, count(1) cnt + from profiles_a + group by md5 + having cnt > 2 + order by cnt desc + </sql> + </report> + + <report reportName="MD5 Duplicate Counts B" + reportFilename="md5/md5_duplicate_counts_B.xlsx" + format="xlsx" + includeSql="true"> + + <sql> + select md5, count(1) cnt + from profiles_b + group by md5 + having cnt > 2 + order by cnt desc + </sql> + </report> + + <report reportName="MD5 Duplicates A" + reportFilename="md5/md5_duplicates_A.xlsx" + format="xlsx" + includeSql="true"> + + <sql> + select file_path, file_name, is_embedded, content_length, NUM_TOKENS, p.md5 + from md5_multiples_tmp_a t + join profiles_a p on p.md5 = t.md5 + join containers c on p.container_id = c.container_id + 
join contents_a cb on p.id=cb.id + order by t.cnt desc + </sql> + </report> + + <report reportName="MD5 Duplicates B" + reportFilename="md5/md5_duplicates_B.xlsx" + format="xlsx" + includeSql="true"> + + <sql> + select file_path, file_name, is_embedded, content_length, NUM_TOKENS, p.md5 + from md5_multiples_tmp_b t + join profiles_b p on p.md5 = t.md5 + join containers c on p.container_id = c.container_id + join contents_b cb on p.id=cb.id + order by t.cnt desc + </sql> + </report> + --> + + <report reportName="Attachment Diffs" + reportFilename="attachments/attachment_diffs.xlsx" + format="xlsx" + includeSql="true"> + + <sql> + select file_path, + ma.mime_string as mime_string_a, + mb.mime_string as mime_string_b, + pa.num_attachments as num_attachments_a, + pb.num_attachments as num_attachments_b, + ea.parse_exception_type_id as exception_type_id_a, + eb.parse_exception_type_id as exception_type_id_b + from profiles_a pa + join profiles_b pb on pa.id= pb.id + join containers c on pa.container_id=c.container_id + join mimes ma on pa.mime_type_id=ma.mime_type_id + join mimes mb on pb.mime_type_id=mb.mime_type_id + left join exceptions_a ea on ea.id=pa.id + left join exceptions_b eb on eb.id=pb.id + where pa.is_embedded=false and + ea.parse_exception_type_id is null and + eb.parse_exception_type_id is null + and pa.num_attachments <> pb.num_attachments + order by ma.mime_string, pb.num_attachments-pa.num_attachments + limit 1000; + </sql> + </report> + + <!-- metadata values --> + <report reportName="Metadata Value Diffs" + reportFilename="metadata/metadata_value_count_diffs.xlsx" + format="xlsx" + includeSql="true"> + + <sql> + select file_path, + ma.mime_string as mime_string_a, + mb.mime_string as mime_string_b, + pa.num_metadata_values as num_metadata_values_a, + pb.num_metadata_values as num_metadata_values_b, + ea.parse_exception_type_id as parse_ex_type_id_a, + eb.parse_exception_type_id as parse_ex_type_id_b + from profiles_a pa + join profiles_b pb on 
pa.id= pb.id + join containers c on pa.container_id=c.container_id + join mimes ma on pa.mime_type_id=ma.mime_type_id + join mimes mb on pb.mime_type_id=mb.mime_type_id + left join exceptions_a ea on ea.id=pa.id + left join exceptions_b eb on eb.id=pb.id + where + ea.parse_exception_type_id is null and + eb.parse_exception_type_id is null + and pa.num_metadata_values <> pb.num_metadata_values + order by ma.mime_string, + pb.num_metadata_values-pa.num_metadata_values + </sql> + </report> + + <after> + <sql>drop table if exists md5_multiples_tmp_a</sql> + <sql>drop table if exists md5_multiples_tmp_b</sql> + </after> +</reports> \ No newline at end of file http://git-wip-us.apache.org/repos/asf/tika/blob/aa7a0c35/tika-eval/src/main/resources/lucene-analyzers.json ---------------------------------------------------------------------- diff --git a/tika-eval/src/main/resources/lucene-analyzers.json b/tika-eval/src/main/resources/lucene-analyzers.json new file mode 100644 index 0000000..268494f --- /dev/null +++ b/tika-eval/src/main/resources/lucene-analyzers.json @@ -0,0 +1,107 @@ +{ + "analyzers": { + "general" : + { + "charfilters": [ + { + "factory": "oala.charfilter.MappingCharFilterFactory", + "params": { + "mapping" : "/lucene-char-mapping.txt" + } + } + ], + "tokenizer": { + "factory": "oala.standard.UAX29URLEmailTokenizerFactory", + "params": {} + }, + "tokenfilters": [ + { + "factory": "oala.icu.ICUFoldingFilterFactory", + "params": {} + }, + { + "factory": "oala.cjk.CJKBigramFilterFactory", + "params": { + "outputUnigrams" : "false" + } + } + ] + + }, + + "alpha" : + { + "charfilters": [ + { + "factory": "oala.charfilter.MappingCharFilterFactory", + "params": { + "mapping" : "/lucene-char-mapping.txt" + } + } + ], + "tokenizer": { + "factory": "oala.standard.UAX29URLEmailTokenizerFactory", + "params": {} + }, + "tokenfilters": [ + { + "factory": "oala.icu.ICUFoldingFilterFactory", + "params": {} + }, + { + "factory": "oala.pattern.PatternReplaceFilterFactory", 
+ "params": { + "pattern": "^[\\w+\\.]{1,30}@(?:\\w+\\.){1,10}\\w+$", + "replacement": "___email___", + "replace": "all" + } + }, + { + "factory": "oala.pattern.PatternReplaceFilterFactory", + "params": { + "pattern": "^(?:(?:ftp|https?):\\/\\/)?(?:\\w+\\.){1,10}\\w+$", + "replacement": "___url___", + "replace": "all" + } + }, + { + "factory": "oala.cjk.CJKBigramFilterFactory", + "params": { + "outputUnigrams" : "false" + } + }, + { + "factory": "org.apache.tika.eval.tokens.AlphaIdeographFilterFactory", + "params": {} + } + ] + + }, + "common_tokens" : + { + "tokenizer": { + "factory": "oala.standard.UAX29URLEmailTokenizerFactory", + "params": {} + }, + + "tokenfilters": [ + { + "factory": "oala.cjk.CJKBigramFilterFactory", + "params": { + "outputUnigrams" : "false" + } + }, + { + "factory": "org.apache.tika.eval.tokens.CJKBigramAwareLengthFilterFactory", + "params": { + "min" : 4, + "max" : 20 + } + } + + ] + + } + + } +} \ No newline at end of file http://git-wip-us.apache.org/repos/asf/tika/blob/aa7a0c35/tika-eval/src/main/resources/lucene-char-mapping.txt ---------------------------------------------------------------------- diff --git a/tika-eval/src/main/resources/lucene-char-mapping.txt b/tika-eval/src/main/resources/lucene-char-mapping.txt new file mode 100644 index 0000000..06db6b9 --- /dev/null +++ b/tika-eval/src/main/resources/lucene-char-mapping.txt @@ -0,0 +1,2 @@ +"\u2018" => "'" +"\u2019" => "'" \ No newline at end of file http://git-wip-us.apache.org/repos/asf/tika/blob/aa7a0c35/tika-eval/src/main/resources/profile-reports.xml ---------------------------------------------------------------------- diff --git a/tika-eval/src/main/resources/profile-reports.xml b/tika-eval/src/main/resources/profile-reports.xml new file mode 100644 index 0000000..2a94a97 --- /dev/null +++ b/tika-eval/src/main/resources/profile-reports.xml @@ -0,0 +1,148 @@ +<?xml version="1.0" encoding="UTF-8" standalone="no" ?> +<!-- + Licensed to the Apache Software Foundation (ASF) 
under one + or more contributor license agreements. See the NOTICE file + distributed with this work for additional information + regarding copyright ownership. The ASF licenses this file + to you under the Apache License, Version 2.0 (the + "License"); you may not use this file except in compliance + with the License. You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, + software distributed under the License is distributed on an + "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + KIND, either express or implied. See the License for the + specific language governing permissions and limitations + under the License. +--> + +<reports> + + + <before> + <!-- <sql>create index on x</sql>--> + </before> + + + <!-- MIMES --> + <report reportName="All Mimes" + reportFilename="mimes/all_mimes.xlsx" + format="xlsx" + includeSql="true"> + + <sql> + select mime_string, count(1) cnt from + profiles p + join mimes m on m.mime_type_id = p.mime_type_id + group by mime_string + order by cnt desc + </sql> + </report> + <report reportName="Container Mimes" + reportFilename="mimes/container_mimes.xlsx" + format="xlsx" + includeSql="true"> + + <sql> + select mime_string, count(1) cnt from + profiles p + join mimes m on m.mime_type_id = p.mime_type_id + where is_embedded=false + group by mime_string + order by cnt desc + </sql> + </report> + + <report reportName="Embedded Mimes" + reportFilename="mimes/embedded_mimes.xlsx" + format="xlsx" + includeSql="true"> + + <sql> + select mime_string, count(1) cnt from + profiles p + join mimes m on m.mime_type_id = p.mime_type_id + where is_embedded=true + group by mime_string + order by cnt desc + </sql> + </report> + + <!-- content --> + <report reportName="Common Tokens by Lang" + reportFilename="content/common_tokens_by_lang.xlsx" + format="xlsx" + includeSql="true"> + + <sql> + select common_tokens_lang, sum(num_common_tokens) as cnt + 
from contents + group by common_tokens_lang + order by cnt desc; + </sql> + </report> + + <report reportName="Detected Languages" + reportFilename="content/detected_langs.xlsx" + format="xlsx" + includeSql="true"> + + <sql> + select LANG_ID_1 as DetectedLang, count(1) as cnt + from contents + group by LANG_ID_1 + order by cnt desc + </sql> + </report> + + + + <report reportName="Token Count by Detected Language" + reportFilename="content/num_tokens_by_detected_langs.xlsx" + format="xlsx" + includeSql="true"> + + <sql> + select LANG_ID_1 as DetectedLang, sum(num_tokens) as cnt + from contents + group by LANG_ID_1 + order by cnt desc; + </sql> + </report> + <report reportName="Exceptions by Type" + reportFilename="exceptions/exceptions_by_type.xlsx" + format="xlsx" + includeSql="true"> + + <sql> + select LANG_ID_1 as DetectedLang, count(1) as cnt + from contents + group by LANG_ID_1 + order by cnt desc + </sql> + </report> + + + <report reportName="Embedded Exceptions by Type" + reportFilename="exceptions/exceptions_by_type_embedded.xlsx" + format="xlsx" + includeSql="true"> + + <sql> + select parse_exception_description, count(1) cnt + from parse_exceptions e + join profiles p on p.id = e.id + join ref_parse_exception_types et on et.parse_exception_type_id=e.parse_exception_type_id + where is_embedded=true + group by parse_exception_description + order by cnt desc; + </sql> + </report> + <after> + + <!--<sql>drop index on x</sql> + --> + </after> +</reports> \ No newline at end of file http://git-wip-us.apache.org/repos/asf/tika/blob/aa7a0c35/tika-eval/src/main/resources/tika-eval-comparison-config.xml ---------------------------------------------------------------------- diff --git a/tika-eval/src/main/resources/tika-eval-comparison-config.xml b/tika-eval/src/main/resources/tika-eval-comparison-config.xml new file mode 100644 index 0000000..b29764e --- /dev/null +++ b/tika-eval/src/main/resources/tika-eval-comparison-config.xml @@ -0,0 +1,83 @@ +<?xml version="1.0" 
encoding="UTF-8" standalone="no" ?> +<!-- + Licensed to the Apache Software Foundation (ASF) under one + or more contributor license agreements. See the NOTICE file + distributed with this work for additional information + regarding copyright ownership. The ASF licenses this file + to you under the Apache License, Version 2.0 (the + "License"); you may not use this file except in compliance + with the License. You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, + software distributed under the License is distributed on an + "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + KIND, either express or implied. See the License for the + specific language governing permissions and limitations + under the License. +--> + +<tika-batch-config + maxAliveTimeSeconds="-1" + pauseOnEarlyTerminationMillis = "500" + timeoutCheckPulseMillis="1000" + maxQueueSize="10000" + numConsumers="5" + timeoutThresholdMillis="300000" + > + + <commandline> + <option opt="c" longOpt="tika-config" hasArg="true" + description="TikaConfig file"/> + <option opt="bc" longOpt="batch-config" hasArg="true" + description="xml batch config file" required="true"/> + <option opt="inputDir" hasArg="true" + description="dir to start crawling"/> + <option opt="numConsumers" hasArg="true" + description="number of fileConsumers threads"/> + <option opt="extractsA" hasArg="true" + description="this dir for analysis" required="false"/> + <option opt="extractsB" hasArg="true" + description="thatDir for analysis"/> + <option opt="db" hasArg="true" + description="name of db directory or file to which to write results"/> + <option opt="alterExtract" hasArg="true" + description="for json-formatted extract files + process full metadata list ('as_is'=default), + take just the first/container document ('first_only'), + concatenate all content into the first metadata item ('concatenate_content')"/> + <option 
opt="includeFilePat" hasArg="true" + description="regex for files to include"/> + </commandline> + + + <!-- + Can also add startDir: this tells the crawler to start indexing a + child directory of the srcDir directory. + --> + <crawler builderClass="org.apache.tika.batch.fs.builders.FSCrawlerBuilder" + crawlOrder="sorted" + maxConsecWaitMillis="30000" + maxFilesToAdd="-1" + maxFilesToConsider="-1" + includeFilePat="" + excludeFilePat="" + maxFileSizeBytes="10000000" + /> + + <consumers builderClass="org.apache.tika.eval.batch.EvalConsumersBuilder" + consumerBuilderClass="org.apache.tika.eval.batch.FileComparerBuilder" + dbAppend="false" + crawlingInputDir="false" + minJsonFileSizeBytes="-1" + maxJsonFileSizeBytes="2000000" + commonTokens="resources/commontokens" + /> + +<!-- langModelDir="resources/langmodels" --> + + <!-- reporter and interrupter are optional --> + <reporter builderClass="org.apache.tika.batch.builders.SimpleLogReporterBuilder" sleepMillis="1000" staleThresholdMillis="500000"/> + <interrupter builderClass="org.apache.tika.batch.builders.InterrupterBuilder"/> +</tika-batch-config> http://git-wip-us.apache.org/repos/asf/tika/blob/aa7a0c35/tika-eval/src/main/resources/tika-eval-profiler-config.xml ---------------------------------------------------------------------- diff --git a/tika-eval/src/main/resources/tika-eval-profiler-config.xml b/tika-eval/src/main/resources/tika-eval-profiler-config.xml new file mode 100644 index 0000000..bd94b25 --- /dev/null +++ b/tika-eval/src/main/resources/tika-eval-profiler-config.xml @@ -0,0 +1,76 @@ +<?xml version="1.0" encoding="UTF-8" standalone="no" ?> +<!-- + Licensed to the Apache Software Foundation (ASF) under one + or more contributor license agreements. See the NOTICE file + distributed with this work for additional information + regarding copyright ownership. 
The ASF licenses this file + to you under the Apache License, Version 2.0 (the + "License"); you may not use this file except in compliance + with the License. You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, + software distributed under the License is distributed on an + "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + KIND, either express or implied. See the License for the + specific language governing permissions and limitations + under the License. +--> + +<tika-batch-config + maxAliveTimeSeconds="-1" + pauseOnEarlyTerminationMillis="500" + timeoutCheckPulseMillis="1000" + maxQueueSize="10000" + numConsumers="5" + timeoutThresholdMillis="300000"> + + <commandline> + <option opt="c" longOpt="tika-config" hasArg="true" + description="TikaConfig file"/> + + <option opt="bc" longOpt="batch-config" hasArg="true" + description="xml batch config file" required="true"/> + <option opt="inputDir" hasArg="true" + description="dir to start crawling"/> + <option opt="numConsumers" hasArg="true" + description="number of fileConsumers threads"/> + <option opt="extractDir" hasArg="true" + description="this dir for analysis" required="false"/> + <option opt="db" hasArg="true" + description="name of db directory or file to which to write results"/> + <option opt="alterExtract" hasArg="true" + description="for json-formatted extract files + process full metadata list ('as_is'=default), + take just the first/container document ('first_only'), + concatenate all content into the first metadata item ('concatenate_content')"/> + + </commandline> + + + <!-- + Can also add startDir: this tells the crawler to start indexing a + child directory of the inputDir directory. 
+ --> + <crawler builderClass="org.apache.tika.batch.fs.builders.FSCrawlerBuilder" + + crawlOrder="sorted" + maxConsecWaitMillis="5000" + maxFilesToAdd="-1" + maxFilesToConsider="-1" + includeFilePat="" + excludeFilePat="" + maxFileSizeBytes="-1" + /> + + <consumers builderClass="org.apache.tika.eval.batch.EvalConsumersBuilder" + consumerBuilderClass="org.apache.tika.eval.batch.SingleFileConsumerBuilder" + commonTokens="resources/commontokens"/> + + + <!-- reporter and interrupter are optional --> + <reporter builderClass="org.apache.tika.batch.builders.SimpleLogReporterBuilder" sleepMillis="1000" + staleThresholdMillis="500000"/> + <interrupter builderClass="org.apache.tika.batch.builders.InterrupterBuilder"/> +</tika-batch-config> http://git-wip-us.apache.org/repos/asf/tika/blob/aa7a0c35/tika-eval/src/test/java/org/apache/tika/MockDBWriter.java ---------------------------------------------------------------------- diff --git a/tika-eval/src/test/java/org/apache/tika/MockDBWriter.java b/tika-eval/src/test/java/org/apache/tika/MockDBWriter.java new file mode 100644 index 0000000..b2edab7 --- /dev/null +++ b/tika-eval/src/test/java/org/apache/tika/MockDBWriter.java @@ -0,0 +1,73 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+ * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.apache.tika; + + +import java.io.IOException; +import java.util.ArrayList; +import java.util.HashMap; +import java.util.List; +import java.util.Map; + +import org.apache.tika.eval.db.Cols; +import org.apache.tika.eval.db.TableInfo; +import org.apache.tika.eval.io.IDBWriter; + +public class MockDBWriter implements IDBWriter { + //Map of tableName and tables + //each table consists of a list of rows. + //Each row consists of a map of columns/values + Map<String, List<Map<Cols, String>>> db = new HashMap<String, List<Map<Cols, String>>>(); + + public MockDBWriter() throws Exception { + } + + @Override + public void writeRow(TableInfo tableInfo, Map<Cols, String> row) throws IOException { + List<Map<Cols, String>> table = db.get(tableInfo.getName()); + if (table == null) { + table = new ArrayList<Map<Cols, String>>(); + } + table.add(row); + db.put(tableInfo.getName(), table); + } + + @Override + public void close() throws IOException { + //no-op + } + + @Override + public int getMimeId(String mimeString) { + //TODO: fix this + return 0; + } + + public List<Map<Cols, String>> getTable(TableInfo tableInfo) { + if (db.get(tableInfo.getName()) == null) { + System.err.println("I can't seem to find: "+ tableInfo.getName() + ", but I do see:"); + for (String table : db.keySet()) { + System.err.println(table); + } + } + return db.get(tableInfo.getName()); + } + + public void clear() { + db.clear(); + } +} http://git-wip-us.apache.org/repos/asf/tika/blob/aa7a0c35/tika-eval/src/test/java/org/apache/tika/eval/AnalyzerManagerTest.java ---------------------------------------------------------------------- diff --git a/tika-eval/src/test/java/org/apache/tika/eval/AnalyzerManagerTest.java b/tika-eval/src/test/java/org/apache/tika/eval/AnalyzerManagerTest.java new file mode 100644 index 0000000..f1b9163 --- /dev/null +++ 
b/tika-eval/src/test/java/org/apache/tika/eval/AnalyzerManagerTest.java @@ -0,0 +1,79 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.tika.eval; + +import static org.junit.Assert.assertTrue; +import static org.junit.Assert.fail; + +import java.util.HashSet; +import java.util.Set; + +import org.apache.lucene.analysis.Analyzer; +import org.apache.lucene.analysis.TokenStream; +import org.apache.lucene.analysis.tokenattributes.CharTermAttribute; +import org.apache.tika.eval.tokens.AnalyzerManager; +import org.junit.Test; + +public class AnalyzerManagerTest { + + @Test + public void testGeneral() throws Exception { + AnalyzerManager analyzerManager = AnalyzerManager.newInstance(); + Analyzer general = analyzerManager.getGeneralAnalyzer(); + TokenStream ts = general.tokenStream("f", "tHe quick aaaa aaa anD dirty dog"); + ts.reset(); + + CharTermAttribute termAtt = ts.getAttribute(CharTermAttribute.class); + Set<String> seen = new HashSet<>(); + while (ts.incrementToken()) { + seen.add(termAtt.toString()); + } + ts.end(); + ts.close(); + + assertTrue(seen.contains("the")); + assertTrue(seen.contains("and")); + assertTrue(seen.contains("dog")); + + } + + @Test + public void testCommon() throws 
Exception { + AnalyzerManager analyzerManager = AnalyzerManager.newInstance(); + Analyzer common = analyzerManager.getAlphaIdeoAnalyzer(); + TokenStream ts = common.tokenStream("f", "the 5,000.12 and dirty dog"); + ts.reset(); + CharTermAttribute termAtt = ts.getAttribute(CharTermAttribute.class); + Set<String> seen = new HashSet<>(); + while (ts.incrementToken()) { + if (termAtt.toString().contains("5")) { + fail("Shouldn't have found a numeric"); + } + seen.add(termAtt.toString()); + } + ts.end(); + ts.close(); + + assertTrue(seen.contains("the")); + assertTrue(seen.contains("and")); + assertTrue(seen.contains("dog")); + + + } + +}
