http://git-wip-us.apache.org/repos/asf/tika/blob/aa7a0c35/tika-eval/src/main/java/org/apache/tika/eval/tokens/TokenCounter.java ---------------------------------------------------------------------- diff --git a/tika-eval/src/main/java/org/apache/tika/eval/tokens/TokenCounter.java b/tika-eval/src/main/java/org/apache/tika/eval/tokens/TokenCounter.java new file mode 100644 index 0000000..28e1c78 --- /dev/null +++ b/tika-eval/src/main/java/org/apache/tika/eval/tokens/TokenCounter.java @@ -0,0 +1,167 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ +package org.apache.tika.eval.tokens; + +import java.io.IOException; +import java.util.Collections; +import java.util.HashMap; +import java.util.Map; + +import org.apache.commons.lang3.mutable.MutableInt; +import org.apache.commons.math3.stat.descriptive.SummaryStatistics; +import org.apache.commons.math3.util.FastMath; +import org.apache.lucene.analysis.Analyzer; +import org.apache.lucene.analysis.TokenStream; +import org.apache.lucene.analysis.tokenattributes.CharTermAttribute; + +public class TokenCounter { + + private static final String ALPHA_IDEOGRAPH_SUFFIX = "_a"; + + + Map<String, Map<String, MutableInt>> map = new HashMap<>(); //Map<field, Map<token, count>> + Map<String, TokenStatistics> tokenStatistics = new HashMap<>(); + + private final TokenStatistics NULL_TOKEN_STAT = new TokenStatistics( + 0, 0, new TokenIntPair[0], 0.0d, new SummaryStatistics()); + + private final Analyzer generalAnalyzer; + private final Analyzer alphaIdeoAnalyzer; + + private int topN = 10; + + public TokenCounter(Analyzer generalAnalyzer, Analyzer alphaIdeoAnalyzer) throws IOException { + this.generalAnalyzer = generalAnalyzer; + this.alphaIdeoAnalyzer = alphaIdeoAnalyzer; + } + + public void add(String field, String content) throws IOException { + _add(field, generalAnalyzer, content); + _add(field+ALPHA_IDEOGRAPH_SUFFIX, alphaIdeoAnalyzer, content); + } + + private void _add(String field, Analyzer analyzer, String content) throws IOException { + int totalTokens = 0; + + TokenStream ts = analyzer.tokenStream(field, content); + CharTermAttribute termAtt = ts.getAttribute(CharTermAttribute.class); + ts.reset(); + Map<String, MutableInt> tokenMap = map.get(field); + if (tokenMap == null) { + tokenMap = new HashMap<>(); + map.put(field, tokenMap); + } + while (ts.incrementToken()) { + String token = termAtt.toString(); + MutableInt cnt = tokenMap.get(token); + if (cnt == null) { + cnt = new MutableInt(1); + tokenMap.put(token, cnt); + } else { + cnt.increment(); + } + 
totalTokens++; + } + ts.close(); + ts.end(); + + int totalUniqueTokens = tokenMap.size(); + + double ent = 0.0d; + double p = 0.0d; + double base = 2.0; + + TokenCountPriorityQueue queue = new TokenCountPriorityQueue(topN); + + SummaryStatistics summaryStatistics = new SummaryStatistics(); + for (Map.Entry<String, MutableInt> e : tokenMap.entrySet()) { + String token = e.getKey(); + int termFreq = e.getValue().intValue(); + + p = (double) termFreq / (double) totalTokens; + ent += p * FastMath.log(base, p); + int len = token.codePointCount(0, token.length()); + for (int i = 0; i < e.getValue().intValue(); i++) { + summaryStatistics.addValue(len); + } + if (queue.top() == null || queue.size() < topN || + termFreq >= queue.top().getValue()) { + queue.insertWithOverflow(new TokenIntPair(token, termFreq)); + } + + } + if (totalTokens > 0) { + ent = (-1.0d / (double)totalTokens) * ent; + } + +/* Collections.sort(allTokens); + List<TokenIntPair> topNList = new ArrayList<>(topN); + for (int i = 0; i < topN && i < allTokens.size(); i++) { + topNList.add(allTokens.get(i)); + }*/ + + tokenStatistics.put(field, new TokenStatistics(totalUniqueTokens, totalTokens, + queue.getArray(), ent, summaryStatistics)); + + } + + public TokenStatistics getTokenStatistics(String field) { + TokenStatistics tokenStat = tokenStatistics.get(field); + if (tokenStat == null) { + return NULL_TOKEN_STAT; + } + return tokenStat; + } + + public void setTopN(int topN) { + this.topN = topN; + } + + public void clear(String field) { + Map<String, MutableInt> tokenMap = map.get(field); + if (tokenMap != null) { + tokenMap.clear(); + } + Map<String, MutableInt> alphaMap = map.get(field+ALPHA_IDEOGRAPH_SUFFIX); + if (alphaMap != null) { + alphaMap.clear(); + } + + tokenStatistics.put(field+ALPHA_IDEOGRAPH_SUFFIX, NULL_TOKEN_STAT); + tokenStatistics.put(field, NULL_TOKEN_STAT); + } + + public Map<String, MutableInt> getAlphaTokens(String field) { + Map<String, MutableInt> ret = 
map.get(field+ALPHA_IDEOGRAPH_SUFFIX); + if (ret == null) { + return Collections.emptyMap(); + } + return ret; + } + + public Map<String, MutableInt> getTokens(String field) { + Map<String, MutableInt> ret = map.get(field); + if (ret == null) { + return Collections.emptyMap(); + } + return ret; + } + + public TokenStatistics getAlphaTokenStatistics(String fieldName) { + return getTokenStatistics(fieldName+ALPHA_IDEOGRAPH_SUFFIX); + } +}
http://git-wip-us.apache.org/repos/asf/tika/blob/aa7a0c35/tika-eval/src/main/java/org/apache/tika/eval/tokens/TokenIntPair.java ---------------------------------------------------------------------- diff --git a/tika-eval/src/main/java/org/apache/tika/eval/tokens/TokenIntPair.java b/tika-eval/src/main/java/org/apache/tika/eval/tokens/TokenIntPair.java new file mode 100644 index 0000000..4b57d25 --- /dev/null +++ b/tika-eval/src/main/java/org/apache/tika/eval/tokens/TokenIntPair.java @@ -0,0 +1,82 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
/**
 * Immutable (token, count) pair.
 * <p>
 * Sort order ({@link #compareTo(TokenIntPair)}) is descending by value,
 * then ascending by token, so "most frequent first" sorts naturally.
 * <p>
 * Review note: the {@code org.jetbrains.annotations.NotNull} annotation was
 * removed -- it added an IDE-only third-party dependency and has no effect
 * on behavior.
 */
public class TokenIntPair implements Comparable<TokenIntPair> {

    final String token;
    final int value;

    public TokenIntPair(String token, int value) {
        this.token = token;
        this.value = value;
    }

    /**
     * @return the count; widened to {@code long} for historical/caller
     * compatibility even though it is stored as an {@code int}
     */
    public long getValue() {
        return value;
    }

    public String getToken() {
        return token;
    }

    @Override
    public boolean equals(Object o) {
        if (this == o) return true;
        if (o == null || getClass() != o.getClass()) return false;

        TokenIntPair that = (TokenIntPair) o;

        if (value != that.value) return false;
        return token.equals(that.token);
    }

    @Override
    public int hashCode() {
        int result = token.hashCode();
        result = 31 * result + value;
        return result;
    }

    /**
     * Descending by value, ascending by token.
     *
     * @param o other token/count pair
     * @return comparison result
     */
    @Override
    public int compareTo(TokenIntPair o) {
        if (this.value > o.value) {
            return -1;
        } else if (this.value < o.value) {
            return 1;
        }
        return this.token.compareTo(o.token);
    }

    @Override
    public String toString() {
        return "TokenIntPair{" +
                "token='" + token + '\'' +
                ", value=" + value +
                '}';
    }
}
+ * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.apache.tika.eval.tokens; + +import java.util.Arrays; + +import org.apache.commons.math3.stat.descriptive.SummaryStatistics; + + +public class TokenStatistics { + + private final int totalTokens; + private final int totalUniqueTokens; + private final TokenIntPair[] topN; + private final double entropy; + private final SummaryStatistics summaryStatistics; + + public TokenStatistics(int totalUniqueTokens, int totalTokens, + TokenIntPair[] topN, + double entropy, SummaryStatistics summaryStatistics) { + this.totalUniqueTokens = totalUniqueTokens; + this.totalTokens = totalTokens; + this.topN = topN; + this.entropy = entropy; + this.summaryStatistics = summaryStatistics; + } + + + public int getTotalTokens() { + + return totalTokens; + } + + public int getTotalUniqueTokens() { + return totalUniqueTokens; + } + + public TokenIntPair[] getTopN() { + return topN; + } + + public double getEntropy() { + return entropy; + } + + public SummaryStatistics getSummaryStatistics() { + return summaryStatistics; + } + + + @Override + public String toString() { + return "TokenStatistics{" + + "totalTokens=" + totalTokens + + ", totalUniqueTokens=" + totalUniqueTokens + + ", topN=" + Arrays.toString(topN) + + ", entropy=" + entropy + + ", summaryStatistics=" + summaryStatistics + + '}'; + } + + @Override + public boolean equals(Object o) { + + if (this == o) return true; + if (o == null || 
getClass() != o.getClass()) return false; + + TokenStatistics that = (TokenStatistics) o; + + if (totalTokens != that.totalTokens) return false; + if (totalUniqueTokens != that.totalUniqueTokens) return false; + if (!doubleEquals(that.entropy, entropy)) return false; + // Probably incorrect - comparing Object[] arrays with Arrays.equals + if (!Arrays.equals(topN, that.topN)) return false; + + SummaryStatistics thatS = ((TokenStatistics) o).summaryStatistics; + if (summaryStatistics.getN() != thatS.getN()) return false; + + //if both have n==0, don't bother with the stats + if (summaryStatistics.getN() ==0L) return true; + //TODO: consider adding others... + if (!doubleEquals(summaryStatistics.getGeometricMean(), thatS.getGeometricMean())) return false; + if (!doubleEquals(summaryStatistics.getMax(), thatS.getMax())) return false; + if (!doubleEquals(summaryStatistics.getMean(), thatS.getMean())) return false; + if (!doubleEquals(summaryStatistics.getMin(), thatS.getMin())) return false; + if (!doubleEquals(summaryStatistics.getSum(), thatS.getSum())) return false; + if (!doubleEquals(summaryStatistics.getStandardDeviation(), thatS.getStandardDeviation())) return false; + return true; + } + + @Override + public int hashCode() { + int result; + long temp; + result = (int) (totalTokens ^ (totalTokens >>> 32)); + result = 31 * result + (int) (totalUniqueTokens ^ (totalUniqueTokens >>> 32)); + result = 31 * result + Arrays.hashCode(topN); + temp = Double.doubleToLongBits(entropy); + result = 31 * result + (int) (temp ^ (temp >>> 32)); + result = 31 * result + summaryStatistics.hashCode(); + return result; + } + + private static boolean doubleEquals(double a, double b) { + return doubleEquals(a, b, 0.000000000001d); + } + + private static boolean doubleEquals(double a, double b, double epsilon) { + return a == b ? 
true : Math.abs(a - b) < epsilon; + } + + +} http://git-wip-us.apache.org/repos/asf/tika/blob/aa7a0c35/tika-eval/src/main/java/org/apache/tika/eval/util/LanguageIDWrapper.java ---------------------------------------------------------------------- diff --git a/tika-eval/src/main/java/org/apache/tika/eval/util/LanguageIDWrapper.java b/tika-eval/src/main/java/org/apache/tika/eval/util/LanguageIDWrapper.java new file mode 100644 index 0000000..59d032a --- /dev/null +++ b/tika-eval/src/main/java/org/apache/tika/eval/util/LanguageIDWrapper.java @@ -0,0 +1,69 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ +package org.apache.tika.eval.util; + +import java.io.IOException; +import java.nio.file.Path; +import java.util.List; + +import com.google.common.base.Optional; +import com.optimaize.langdetect.DetectedLanguage; +import com.optimaize.langdetect.LanguageDetector; +import com.optimaize.langdetect.LanguageDetectorBuilder; +import com.optimaize.langdetect.i18n.LdLocale; +import com.optimaize.langdetect.ngram.NgramExtractors; +import com.optimaize.langdetect.profiles.LanguageProfile; +import com.optimaize.langdetect.profiles.LanguageProfileReader; +import com.optimaize.langdetect.text.CommonTextObjectFactories; +import com.optimaize.langdetect.text.TextObjectFactory; + + +public class LanguageIDWrapper { + static List<LanguageProfile> languageProfiles; + static LanguageDetector detector; + static TextObjectFactory textObjectFactory; + + public static void loadBuiltInModels() throws IOException { + + languageProfiles = new LanguageProfileReader().readAllBuiltIn(); + detector = LanguageDetectorBuilder.create(NgramExtractors.standard()) + .withProfiles(languageProfiles) + .build(); + textObjectFactory = CommonTextObjectFactories.forDetectingOnLargeText(); + } + + public static void loadModels(Path path) throws IOException { + + languageProfiles = new LanguageProfileReader().readAll(path.toFile()); + detector = LanguageDetectorBuilder.create(NgramExtractors.standard()) + .withProfiles(languageProfiles) + .build(); + textObjectFactory = CommonTextObjectFactories.forDetectingOnLargeText(); + } + + + + public static Optional<LdLocale> detect(String s) { + return detector.detect(textObjectFactory.forText(s)); + } + + public static List<DetectedLanguage> getProbabilities(String s) { + + return detector.getProbabilities(textObjectFactory.forText(s)); + } + +} http://git-wip-us.apache.org/repos/asf/tika/blob/aa7a0c35/tika-eval/src/main/resources/META-INF/services/org.apache.lucene.analysis.util.TokenFilterFactory 
---------------------------------------------------------------------- diff --git a/tika-eval/src/main/resources/META-INF/services/org.apache.lucene.analysis.util.TokenFilterFactory b/tika-eval/src/main/resources/META-INF/services/org.apache.lucene.analysis.util.TokenFilterFactory new file mode 100644 index 0000000..1d21002 --- /dev/null +++ b/tika-eval/src/main/resources/META-INF/services/org.apache.lucene.analysis.util.TokenFilterFactory @@ -0,0 +1,17 @@ +# Licensed to the Apache Software Foundation (ASF) under one or more +# contributor license agreements. See the NOTICE file distributed with +# this work for additional information regarding copyright ownership. +# The ASF licenses this file to You under the Apache License, Version 2.0 +# (the "License"); you may not use this file except in compliance with +# the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +org.apache.tika.eval.tokens.AlphaIdeographFilterFactory +org.apache.tika.eval.tokens.CJKBigramAwareLengthFilterFactory \ No newline at end of file http://git-wip-us.apache.org/repos/asf/tika/blob/aa7a0c35/tika-eval/src/main/resources/comparison-reports.xml ---------------------------------------------------------------------- diff --git a/tika-eval/src/main/resources/comparison-reports.xml b/tika-eval/src/main/resources/comparison-reports.xml new file mode 100644 index 0000000..cb7befd --- /dev/null +++ b/tika-eval/src/main/resources/comparison-reports.xml @@ -0,0 +1,791 @@ +<?xml version="1.0" encoding="UTF-8" standalone="no" ?> + +<!-- + Licensed to the Apache Software Foundation (ASF) under one + or more contributor license agreements. See the NOTICE file + distributed with this work for additional information + regarding copyright ownership. The ASF licenses this file + to you under the Apache License, Version 2.0 (the + "License"); you may not use this file except in compliance + with the License. You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, + software distributed under the License is distributed on an + "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + KIND, either express or implied. See the License for the + specific language governing permissions and limitations + under the License. 
+--> + +<reports> + + + <before> + + <sql>drop table if exists md5_multiples_tmp_a</sql> + <sql>create table md5_multiples_tmp_a (MD5 char(32), cnt int) + as + select md5, count(1) cnt + from profiles_a + where md5 is not null + group by md5 + having cnt > 1 + order by cnt desc + </sql> + + <sql>drop table if exists md5_multiples_tmp_b</sql> + <sql>create table md5_multiples_tmp_b (MD5 char(32), cnt int) + as + select md5, count(1) cnt + from profiles_b + where md5 is not null + group by md5 + having cnt > 1 + order by cnt desc + </sql> + <sql>create index if not exists pa_m_idx + on profiles_a (mime_type_id); + </sql> + <sql> + create index if not exists pb_m_idx + on profiles_b (mime_type_id); + </sql> + + <sql>drop table if exists exceptions_compared</sql> + <sql> + create table exceptions_compared + (mime_type_id integer primary key, + exceptions_a integer, + total_a integer, + percent_exceptions_a double, + exceptions_b integer, + total_b integer, + percent_exceptions_b double) + </sql> + <sql> + insert into exceptions_compared (mime_type_id) + select mime_type_id from mimes; + </sql> + + <sql> + update exceptions_compared ec set total_a=( + select count(1) as cnt from profiles_a + where profiles_a.mime_type_id= ec.mime_type_id + group by mime_type_id + ) + </sql> + <sql> + update exceptions_compared ec set total_b=( + select count(1) as cnt from profiles_b + where profiles_b.mime_type_id= ec.mime_type_id + group by mime_type_id + ) + </sql> + <sql> + update exceptions_compared ec set exceptions_a=( select count(1) as + cnt from exceptions_a ea + join profiles_a pa on ea.id=pa.id + where pa.mime_type_id= ec.mime_type_id + and parse_exception_type_id=0 + group by mime_type_id ) + </sql> + <sql> + update exceptions_compared ec set exceptions_b=( + select count(1) as cnt from exceptions_b eb + join profiles_b pb on eb.id=pb.id + where pb.mime_type_id= ec.mime_type_id + and parse_exception_type_id=0 + group by mime_type_id ) + </sql> + + </before> + + <!-- MIMES 
--> + <report reportName="All Mimes In A" + reportFilename="mimes/all_mimes_A.xlsx" + format="xlsx" + includeSql="true"> + + <sql> + select mime_string, count(1) cnt from + profiles_a p + join mimes m on m.mime_type_id = p.mime_type_id + group by mime_string + order by cnt desc + </sql> + </report> + + <report reportName="All Mimes In B" + reportFilename="mimes/all_mimes_B.xlsx" + format="xlsx" + includeSql="true"> + + <sql> + select mime_string, count(1) cnt from + profiles_b p + join mimes m on m.mime_type_id = p.mime_type_id + group by mime_string + order by cnt desc + </sql> + </report> + <report reportName="Container Mimes In A" + reportFilename="mimes/container_mimes_A.xlsx" + format="xlsx" + includeSql="true"> + + <sql> + select mime_string, count(1) cnt from + profiles_a p + join mimes m on m.mime_type_id = p.mime_type_id + where is_embedded=false + group by mime_string + order by cnt desc + </sql> + </report> + + <report reportName="Container Mimes In B" + reportFilename="mimes/container_mimes_B.xlsx" + format="xlsx" + includeSql="true"> + + <sql> + select mime_string, count(1) cnt from + profiles_b p + join mimes m on m.mime_type_id = p.mime_type_id + where is_embedded=false + group by mime_string + order by cnt desc + </sql> + </report> + <report reportName="Embedded Mimes In A" + reportFilename="mimes/embedded_mimes_A.xlsx" + format="xlsx" + includeSql="true"> + + <sql> + select mime_string, count(1) cnt from + profiles_a p + join mimes m on m.mime_type_id = p.mime_type_id + where is_embedded=true + group by mime_string + order by cnt desc + </sql> + </report> + + <report reportName="Embedded Mimes In B" + reportFilename="mimes/embedded_mimes_B.xlsx" + format="xlsx" + includeSql="true"> + + <sql> + select mime_string, count(1) cnt from + profiles_b p + join mimes m on m.mime_type_id = p.mime_type_id + where is_embedded=true + group by mime_string + order by cnt desc + </sql> + </report> + <report reportName="Mime Differences A -> B" + 
reportFilename="mimes/mime_diffs_A_to_B.xlsx" + format="xlsx" + includeSql="true"> + + <sql> + select concat(ma.mime_string, ' -> ', mb.mime_string) as + MIME_A_TO_MIME_B, count(1) as COUNT + from profiles_a a + join profiles_b b on a.id=b.id + join mimes ma on ma.mime_type_id=a.mime_type_id + join mimes mb on mb.mime_type_id=b.mime_type_id + where a.mime_type_id <> b.mime_type_id + group by MIME_A_TO_MIME_B + order by COUNT DESC + </sql> + </report> + + <report reportName="Mime Differences A -> B Details" + reportFilename="mimes/mime_diffs_A_to_B_details.xlsx" + format="xlsx" + includeSql="true"> + + <sql> + select concat(ma.mime_string, ' -> ', mb.mime_string) as + MIME_A_TO_MIME_B, file_path, a.file_name + from profiles_a a + join profiles_b b on a.id=b.id + join mimes ma on ma.mime_type_id=a.mime_type_id + join mimes mb on mb.mime_type_id=b.mime_type_id + join containers c on a.container_id=c.container_id + where a.mime_type_id <> b.mime_type_id + order by MIME_A_TO_MIME_B + </sql> + </report> + + <report reportName="AllExceptionsByMimeA" + reportFilename="exceptions/exceptions_by_mime_A.xlsx" + format="xlsx" + includeSql="true"> + + <sql> + select mime_string, count(1) cnt from + exceptions_a e + join profiles_a p on p.id=e.id + join mimes m on m.mime_type_id = p.mime_type_id + group by mime_string + order by cnt desc + </sql> + </report> + <report reportName="AllExceptionsByMimeB" + reportFilename="exceptions/exceptions_by_mime_B.xlsx" + format="xlsx" + includeSql="true"> + + <sql> + select mime_string, count(1) cnt from + exceptions_b e + join profiles_b p on p.id=e.id + join mimes m on m.mime_type_id = p.mime_type_id + group by mime_string + order by cnt desc + </sql> + </report> + + <report reportName="ContainerExceptionsByMimeA" + reportFilename="exceptions/container_exceptions_by_mime_A.xlsx" + format="xlsx" + includeSql="true"> + + <sql> + select mime_string, count(1) cnt from + exceptions_a e + join profiles_a p on p.id=e.id + join mimes m on 
m.mime_type_id = p.mime_type_id + where is_embedded=false + and parse_exception_type_id=0 + group by mime_string + order by cnt desc + </sql> + </report> + + <report reportName="ContainerExceptionsByMimeB" + reportFilename="exceptions/container_exceptions_by_mime_B.xlsx" + format="xlsx" + includeSql="true"> + + <sql> + select mime_string, count(1) cnt from + exceptions_b e + join profiles_b p on p.id=e.id + join mimes m on m.mime_type_id = p.mime_type_id + where is_embedded=false + and parse_exception_type_id=0 + group by mime_string + order by cnt desc + </sql> + </report> + <report reportName="AllExceptionsByMimeByTypeA" + reportFilename="exceptions/exceptions_by_mime_by_typeA.xlsx" + format="xlsx" + includeSql="true"> + + <sql> + select mime_string as MIME_TYPE, + parse_exception_description as EXCEPTION_TYPE, count(1) as COUNT + from exceptions_a e + join profiles_a p on p.id=e.id + join containers c on p.container_id=c.container_id + join mimes m on m.mime_type_id=p.mime_type_id + join ref_parse_exception_types r on + r.parse_exception_type_id=e.parse_exception_type_id + group by p.mime_type_id, parse_exception_description + order by MIME_TYPE, EXCEPTION_TYPE + </sql> + </report> + + <report reportName="AllExceptionsByMimeByTypeB" + reportFilename="exceptions/exceptions_by_mime_by_typeB.xlsx" + format="xlsx" + includeSql="true"> + + <sql> + select mime_string as MIME_TYPE, + parse_exception_description as EXCEPTION_TYPE, count(1) as COUNT + from exceptions_b e + join profiles_b p on p.id=e.id + join containers c on p.container_id=c.container_id + join mimes m on m.mime_type_id=p.mime_type_id + join ref_parse_exception_types r on + r.parse_exception_type_id=e.parse_exception_type_id + group by p.mime_type_id, parse_exception_description + order by MIME_TYPE, EXCEPTION_TYPE + </sql> + </report> + + <report reportName="TextLostFromACausedByNewExceptionsInB" + reportFilename="exceptions/textLostFromACausedByNewExceptionsInB.xlsx" + format="xlsx" + 
includeSql="true"> + + <sql> + select file_path as FILE_PATH, ca.NUM_TOKENS as NUM_TOKENS_A, + cb.NUM_TOKENS as NUM_TOKENS_B, + ca.NUM_UNIQUE_TOKENS as NUM_UNIQUE_TOKENS_A, cb.NUM_UNIQUE_TOKENS + as NUM_UNIQUE_TOKENS_B, + ca.common_tokens_lang as COMMON_TOKENS_LANG_A, + ca.num_common_tokens as NUM_COMMON_TOKENS_A, + cb.common_tokens_lang as COMMON_TOKENS_LANG_B, + cb.num_common_tokens as NUM_COMMON_TOKENS_B, + ca.top_n_tokens as TOP_N_TOKENS_A, cb.top_n_tokens as TOP_N_TOKENS_B, + eb.ORIG_STACK_TRACE as ORIG_STACK_TRACE_B + from contents_a ca + join profiles_a pa on ca.id = pa.id + join containers c on pa.container_id=c.container_id + left join contents_b cb on ca.id=cb.id + left join exceptions_b eb on ca.id = eb.id + left join exceptions_a ea on ca.id = ea.id + where eb.orig_stack_trace is not null + and ea.orig_stack_trace is null + order by ca.num_common_tokens - ifnull(cb.num_common_tokens,0) desc + </sql> + </report> + + <report reportName="FixedExceptionsInBByMimeType" + reportFilename="exceptions/fixedExceptionsInBByMimeType.xlsx" + format="xlsx" + includeSql="true"> + + <sql> + select mime_string as MIME_TYPE, count(1) as COUNT + from exceptions_a ea + left join exceptions_b eb on ea.id = eb.id + join profiles_a pa on pa.id=ea.id + join profiles_b pb on pa.id=pb.id + join containers c on pa.container_id=c.container_id + join mimes m on m.mime_type_id=pa.mime_type_id + where eb.id is null + and ea.parse_exception_type_id=0 + group by mime_string + </sql> + </report> + + <report reportName="FixedExceptionsInByDetails" + reportFilename="exceptions/fixedExceptionsInBDetails.xlsx" + format="xlsx" + includeSql="true"> + <sql> + select mime_string as MIME_TYPE, + file_path, pa.file_name, pa.is_embedded + from exceptions_a ea + left join exceptions_b eb on ea.id = eb.id + join profiles_a pa on pa.id=ea.id + join profiles_b pb on pb.id=pa.id //this ensures that files were actually processed in both runs + join containers c on pa.container_id=c.container_id + join 
mimes m on m.mime_type_id=pa.mime_type_id + where eb.id is null + and ea.parse_exception_type_id=0 + order by mime_string + </sql> + </report> + <report reportName="ContentsOfFixedExceptionsInB" + reportFilename="exceptions/contentsOfFixedExceptionsInB.xlsx" + format="xlsx" + includeSql="true"> + + <sql> + select file_path, mime_string as MIME_TYPE, + CONTENT_LENGTH, + NUM_TOKENS, NUM_UNIQUE_TOKENS, + TOP_N_TOKENS, LANG_ID_1,TOKEN_LENGTH_MEAN, TOKEN_LENGTH_STD_DEV + from exceptions_a ea + left join exceptions_b eb on ea.id = eb.id + join profiles_a p on p.id=ea.id + join contents_b cb on cb.id=ea.id + join containers c on p.container_id=c.container_id + join mimes m on m.mime_type_id=p.mime_type_id + where eb.id is null + and ea.parse_exception_type_id=0 + </sql> + </report> + + <report reportName="NewExceptionsByMimeType" + reportFilename="exceptions/newExceptionsInBByMimeType.xlsx" + format="xlsx" + includeSql="true"> + + <sql> + select mime_string as MIME_TYPE_A, count(1) as COUNT + from exceptions_b eb + left join exceptions_a ea on ea.id = eb.id + join profiles_a pa on pa.id=eb.id + join profiles_b pb on pb.id=pa.id + join containers c on pa.container_id=c.container_id + join mimes m on m.mime_type_id=pa.mime_type_id + where ea.id is null + and eb.parse_exception_type_id=0 + group by mime_string + order by COUNT desc + </sql> + </report> + + <report reportName="NewExceptionsInBByMimeTypeByStackTrace" + reportFilename="exceptions/newExceptionsInBByMimeTypeByStackTrace.xlsx" + format="xlsx" + includeSql="true"> + + <sql> + select MIME_STRING as MIME_TYPE, eb.sort_stack_trace, count(1) as + COUNT + from exceptions_b eb + left join exceptions_a ea on ea.id = eb.id + join profiles_a p on p.id=eb.id + join mimes m on m.mime_type_id=p.mime_type_id + where ea.id is null + and eb.parse_exception_type_id=0 + group by MIME_TYPE, eb.sort_stack_trace + order by MIME_TYPE asc, COUNT desc + </sql> + </report> + + <report reportName="NewExceptionsInBDetails" + 
reportFilename="exceptions/newExceptionsInBDetails.xlsx" + format="xlsx" + includeSql="true"> + + <sql> + select file_path, MIME_STRING as MIME_TYPE, p.length, + eb.orig_stack_trace, eb.sort_stack_trace + from exceptions_b eb + left join exceptions_a ea on ea.id = eb.id + join profiles_a p on p.id=eb.id + join containers c on p.container_id=c.container_id + join mimes m on m.mime_type_id=p.mime_type_id + where ea.id is null + and eb.parse_exception_type_id=0 + order by MIME_TYPE asc, eb.ORIG_STACK_TRACE + </sql> + </report> + + <report reportName="StackTracesByMimeInA" + reportFilename="exceptions/stackTracesByMimeInA.xlsx" + format="xlsx" + includeSql="true"> + + <sql> + select MIME_STRING as MIME_TYPE, e.sort_stack_trace, count(1) as + COUNT + from exceptions_a e + join profiles_a p on p.id=e.id + join mimes m on m.mime_type_id=p.mime_type_id + and e.parse_exception_type_id=0 + group by MIME_TYPE, e.sort_stack_trace + order by MIME_TYPE asc, COUNT desc + </sql> + </report> + + <report reportName="AllStackTracesInA" + reportFilename="exceptions/stackTracesInA.xlsx" + format="xlsx" + includeSql="true"> + + <sql> + select file_path, c.length as FILE_LENGTH, MIME_STRING as MIME_TYPE, + orig_stack_trace, sort_stack_trace + from exceptions_a e + join profiles_a p on p.id=e.id + join containers c on p.container_id=c.container_id + join mimes m on m.mime_type_id=p.mime_type_id + and e.parse_exception_type_id=0 + order by MIME_TYPE asc, sort_stack_trace, orig_stack_trace, + FILE_LENGTH asc + </sql> + </report> + <report reportName="AllStackTracesInB" + reportFilename="exceptions/stackTracesInB.xlsx" + format="xlsx" + includeSql="true"> + + <sql> + select file_path, c.length as FILE_LENGTH, MIME_STRING as MIME_TYPE, + orig_stack_trace, sort_stack_trace + from exceptions_b e + join profiles_b p on p.id=e.id + join containers c on p.container_id=c.container_id + join mimes m on m.mime_type_id=p.mime_type_id + and e.parse_exception_type_id=0 + order by MIME_TYPE asc, 
sort_stack_trace, orig_stack_trace, + FILE_LENGTH asc + </sql> + </report> + + <report reportName="StackTracesByMimeInB" + reportFilename="exceptions/stackTracesByMimeInB.xlsx" + format="xlsx" + includeSql="true"> + + <sql> + select MIME_STRING as MIME_TYPE, e.sort_stack_trace, count(1) as + COUNT + from exceptions_b e + join profiles_b p on p.id=e.id + join mimes m on m.mime_type_id=p.mime_type_id + and e.parse_exception_type_id=0 + group by MIME_TYPE, e.sort_stack_trace + order by MIME_TYPE asc, COUNT desc + </sql> + </report> + <report reportName="extractErrorsA" + reportFilename="exceptions/extract_errors_a.xlsx" + format="xlsx" + includeSql="true"> + <sql> + select file_path, extract_error_description + from extract_errors_a e + join ref_extract_error_types t + on e.extract_error_type_id=t.extract_error_type_id + </sql> + </report> + <report reportName="extractErrorsB" + reportFilename="exceptions/extract_errors_b.xlsx" + format="xlsx" + includeSql="true"> + <sql> + select file_path, extract_error_description + from extract_errors_b e + join ref_extract_error_types t + on e.extract_error_type_id=t.extract_error_type_id + </sql> + </report> + <report reportName="parseExceptionTypesA" + reportFilename="exceptions/overall_exception_types_a.xlsx" + format="xlsx" + includeSql="true"> + <sql> + select parse_exception_description, count(1) + from exceptions_a e + join ref_parse_exception_types t on + t.parse_exception_type_id=e.parse_exception_type_id + group by e.parse_exception_type_id + </sql> + </report> + <report reportName="parseExceptionTypesB" + reportFilename="exceptions/overall_exception_types_b.xlsx" + format="xlsx" + includeSql="true"> + <sql> + select parse_exception_description, count(1) + from exceptions_b e + join ref_parse_exception_types t on + t.parse_exception_type_id=e.parse_exception_type_id + group by e.parse_exception_type_id + </sql> + </report> + + <report reportName="contentDiffsWExceptions" + 
reportFilename="content/content_diffs_with_exceptions.xlsx" + format="xlsx" + includeSql="true"> + + <sql> + select file_path, + ma.mime_string as mime_string_a, + mb.mime_string as mime_string_b, + ca.num_unique_tokens as NUM_UNIQUE_TOKENS_A, + cb.num_unique_tokens as NUM_UNIQUE_TOKENS_B, + ca.num_tokens as NUM_TOKENS_A, + cb.num_tokens as NUM_TOKENS_B, + ca.common_tokens_lang as COMMON_TOKENS_LANG_A, + ca.num_common_tokens as NUM_COMMON_TOKENS_A, + cb.common_tokens_lang as COMMON_TOKENS_LANG_B, + cb.num_common_tokens as NUM_COMMON_TOKENS_B, + ca.top_n_tokens as TOP_N_TOKENS_A, + cb.top_n_tokens as TOP_N_TOKENS_B, + ca.unicode_char_blocks as UNICODE_CHAR_BLOCKS_A, + cb.unicode_char_blocks as UNICODE_CHAR_BLOCKS_B, + top_10_unique_token_diffs_a, + top_10_unique_token_diffs_b, + top_10_more_in_a, top_10_more_in_b, dice_coefficient, overlap + from content_comparisons cc + join contents_a ca on ca.id=cc.id + left join contents_b cb on cb.id=cc.id + join profiles_a pa on pa.id = cc.id + join profiles_b pb on pb.id=cc.id + join containers c on c.container_id=pa.container_id + join mimes ma on ma.mime_type_id=pa.mime_type_id + join mimes mb on mb.mime_type_id=pb.mime_type_id + left join exceptions_a ea on ea.id=cc.id + left join exceptions_b eb on eb.id=cc.id + where (overlap < 0.95 or abs(ca.NUM_TOKENS-cb.NUM_TOKENS) >30) + and (ea.parse_exception_type_id is null or + ea.parse_exception_type_id <> 2) + and (eb.parse_exception_type_id is null or + eb.parse_exception_type_id <> 2) + order by ma.mime_string, overlap asc + limit 100000 + </sql> + </report> + <report reportName="contentDiffsIgnoreExceptions" + reportFilename="content/content_diffs_ignore_exceptions.xlsx" + format="xlsx" + includeSql="true"> + + <sql> + select file_path, + ma.mime_string as mime_string_a, + mb.mime_string as mime_string_b, + ca.NUM_UNIQUE_TOKENS as NUM_UNIQUE_TOKENS_A, + cb.NUM_UNIQUE_TOKENS as NUM_UNIQUE_TOKENS_B, + ca.NUM_TOKENS as NUM_TOKENS_A, + cb.NUM_TOKENS as NUM_TOKENS_B, + 
ca.common_tokens_lang as COMMON_TOKENS_LANG_A, + ca.num_common_tokens as NUM_COMMON_TOKENS_A, + cb.common_tokens_lang as COMMON_TOKENS_LANG_B, + cb.num_common_tokens as NUM_COMMON_TOKENS_B, + ca.top_n_tokens as TOP_N_TOKENS_A, + cb.top_n_tokens as TOP_N_TOKENS_B, + ca.unicode_char_blocks as UNICODE_CHAR_BLOCKS_A, + cb.unicode_char_blocks as UNICODE_CHAR_BLOCKS_B, + top_10_unique_token_diffs_a, + top_10_unique_token_diffs_b, + top_10_more_in_a, top_10_more_in_b, dice_coefficient, overlap + from content_comparisons cc + join contents_a ca on ca.id=cc.id + join contents_b cb on cb.id=cc.id + join profiles_a pa on pa.id = cc.id + join profiles_b pb on pb.id=cc.id + join containers c on c.container_id=pa.container_id + join mimes ma on ma.mime_type_id=pa.mime_type_id + join mimes mb on mb.mime_type_id=pb.mime_type_id + left join exceptions_a ea on ea.id=cc.id + left join exceptions_b eb on eb.id=cc.id + where (overlap < 0.95 or abs(ca.NUM_TOKENS-cb.NUM_TOKENS) >30) + and (ea.parse_exception_type_id is null) + and (eb.parse_exception_type_id is null) + order by ma.mime_string, overlap asc + limit 100000 + </sql> + </report> + <!-- <report reportName="MD5 Duplicate Counts A" + reportFilename="md5/md5_duplicate_counts_A.xlsx" + format="xlsx" + includeSql="true"> + <sql> + select md5, count(1) cnt + from profiles_a + group by md5 + having cnt > 2 + order by cnt desc + </sql> + </report> + + <report reportName="MD5 Duplicate Counts B" + reportFilename="md5/md5_duplicate_counts_B.xlsx" + format="xlsx" + includeSql="true"> + + <sql> + select md5, count(1) cnt + from profiles_b + group by md5 + having cnt > 2 + order by cnt desc + </sql> + </report> + + <report reportName="MD5 Duplicates A" + reportFilename="md5/md5_duplicates_A.xlsx" + format="xlsx" + includeSql="true"> + + <sql> + select file_path, file_name, is_embedded, content_length, NUM_TOKENS, p.md5 + from md5_multiples_tmp_a t + join profiles_a p on p.md5 = t.md5 + join containers c on p.container_id = c.container_id + 
join contents_a cb on p.id=cb.id + order by t.cnt desc + </sql> + </report> + + <report reportName="MD5 Duplicates B" + reportFilename="md5/md5_duplicates_B.xlsx" + format="xlsx" + includeSql="true"> + + <sql> + select file_path, file_name, is_embedded, content_length, NUM_TOKENS, p.md5 + from md5_multiples_tmp_b t + join profiles_b p on p.md5 = t.md5 + join containers c on p.container_id = c.container_id + join contents_b cb on p.id=cb.id + order by t.cnt desc + </sql> + </report> + --> + + <report reportName="Attachment Diffs" + reportFilename="attachments/attachment_diffs.xlsx" + format="xlsx" + includeSql="true"> + + <sql> + select file_path, + ma.mime_string as mime_string_a, + mb.mime_string as mime_string_b, + pa.num_attachments as num_attachments_a, + pb.num_attachments as num_attachments_b, + ea.parse_exception_type_id as exception_type_id_a, + eb.parse_exception_type_id as exception_type_id_b + from profiles_a pa + join profiles_b pb on pa.id= pb.id + join containers c on pa.container_id=c.container_id + join mimes ma on pa.mime_type_id=ma.mime_type_id + join mimes mb on pb.mime_type_id=mb.mime_type_id + left join exceptions_a ea on ea.id=pa.id + left join exceptions_b eb on eb.id=pb.id + where pa.is_embedded=false and + ea.parse_exception_type_id is null and + eb.parse_exception_type_id is null + and pa.num_attachments <> pb.num_attachments + order by ma.mime_string, pb.num_attachments-pa.num_attachments + limit 1000; + </sql> + </report> + + <!-- metadata values --> + <report reportName="Metadata Value Diffs" + reportFilename="metadata/metadata_value_count_diffs.xlsx" + format="xlsx" + includeSql="true"> + + <sql> + select file_path, + ma.mime_string as mime_string_a, + mb.mime_string as mime_string_b, + pa.num_metadata_values as num_metadata_values_a, + pb.num_metadata_values as num_metadata_values_b, + ea.parse_exception_type_id as parse_ex_type_id_a, + eb.parse_exception_type_id as parse_ex_type_id_b + from profiles_a pa + join profiles_b pb on 
pa.id= pb.id + join containers c on pa.container_id=c.container_id + join mimes ma on pa.mime_type_id=ma.mime_type_id + join mimes mb on pb.mime_type_id=mb.mime_type_id + left join exceptions_a ea on ea.id=pa.id + left join exceptions_b eb on eb.id=pb.id + where + ea.parse_exception_type_id is null and + eb.parse_exception_type_id is null + and pa.num_metadata_values <> pb.num_metadata_values + order by ma.mime_string, + pb.num_metadata_values-pa.num_metadata_values + </sql> + </report> + + <after> + <sql>drop table if exists md5_multiples_tmp_a</sql> + <sql>drop table if exists md5_multiples_tmp_b</sql> + </after> +</reports> \ No newline at end of file http://git-wip-us.apache.org/repos/asf/tika/blob/aa7a0c35/tika-eval/src/main/resources/lucene-analyzers.json ---------------------------------------------------------------------- diff --git a/tika-eval/src/main/resources/lucene-analyzers.json b/tika-eval/src/main/resources/lucene-analyzers.json new file mode 100644 index 0000000..268494f --- /dev/null +++ b/tika-eval/src/main/resources/lucene-analyzers.json @@ -0,0 +1,107 @@ +{ + "analyzers": { + "general" : + { + "charfilters": [ + { + "factory": "oala.charfilter.MappingCharFilterFactory", + "params": { + "mapping" : "/lucene-char-mapping.txt" + } + } + ], + "tokenizer": { + "factory": "oala.standard.UAX29URLEmailTokenizerFactory", + "params": {} + }, + "tokenfilters": [ + { + "factory": "oala.icu.ICUFoldingFilterFactory", + "params": {} + }, + { + "factory": "oala.cjk.CJKBigramFilterFactory", + "params": { + "outputUnigrams" : "false" + } + } + ] + + }, + + "alpha" : + { + "charfilters": [ + { + "factory": "oala.charfilter.MappingCharFilterFactory", + "params": { + "mapping" : "/lucene-char-mapping.txt" + } + } + ], + "tokenizer": { + "factory": "oala.standard.UAX29URLEmailTokenizerFactory", + "params": {} + }, + "tokenfilters": [ + { + "factory": "oala.icu.ICUFoldingFilterFactory", + "params": {} + }, + { + "factory": "oala.pattern.PatternReplaceFilterFactory", 
+ "params": { + "pattern": "^[\\w+\\.]{1,30}@(?:\\w+\\.){1,10}\\w+$", + "replacement": "___email___", + "replace": "all" + } + }, + { + "factory": "oala.pattern.PatternReplaceFilterFactory", + "params": { + "pattern": "^(?:(?:ftp|https?):\\/\\/)?(?:\\w+\\.){1,10}\\w+$", + "replacement": "___url___", + "replace": "all" + } + }, + { + "factory": "oala.cjk.CJKBigramFilterFactory", + "params": { + "outputUnigrams" : "false" + } + }, + { + "factory": "org.apache.tika.eval.tokens.AlphaIdeographFilterFactory", + "params": {} + } + ] + + }, + "common_tokens" : + { + "tokenizer": { + "factory": "oala.standard.UAX29URLEmailTokenizerFactory", + "params": {} + }, + + "tokenfilters": [ + { + "factory": "oala.cjk.CJKBigramFilterFactory", + "params": { + "outputUnigrams" : "false" + } + }, + { + "factory": "org.apache.tika.eval.tokens.CJKBigramAwareLengthFilterFactory", + "params": { + "min" : 4, + "max" : 20 + } + } + + ] + + } + + } +} \ No newline at end of file http://git-wip-us.apache.org/repos/asf/tika/blob/aa7a0c35/tika-eval/src/main/resources/lucene-char-mapping.txt ---------------------------------------------------------------------- diff --git a/tika-eval/src/main/resources/lucene-char-mapping.txt b/tika-eval/src/main/resources/lucene-char-mapping.txt new file mode 100644 index 0000000..06db6b9 --- /dev/null +++ b/tika-eval/src/main/resources/lucene-char-mapping.txt @@ -0,0 +1,2 @@ +"\u2018" => "'" +"\u2019" => "'" \ No newline at end of file http://git-wip-us.apache.org/repos/asf/tika/blob/aa7a0c35/tika-eval/src/main/resources/profile-reports.xml ---------------------------------------------------------------------- diff --git a/tika-eval/src/main/resources/profile-reports.xml b/tika-eval/src/main/resources/profile-reports.xml new file mode 100644 index 0000000..2a94a97 --- /dev/null +++ b/tika-eval/src/main/resources/profile-reports.xml @@ -0,0 +1,148 @@ +<?xml version="1.0" encoding="UTF-8" standalone="no" ?> +<!-- + Licensed to the Apache Software Foundation (ASF) 
under one + or more contributor license agreements. See the NOTICE file + distributed with this work for additional information + regarding copyright ownership. The ASF licenses this file + to you under the Apache License, Version 2.0 (the + "License"); you may not use this file except in compliance + with the License. You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, + software distributed under the License is distributed on an + "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + KIND, either express or implied. See the License for the + specific language governing permissions and limitations + under the License. +--> + +<reports> + + + <before> + <!-- <sql>create index on x</sql>--> + </before> + + + <!-- MIMES --> + <report reportName="All Mimes" + reportFilename="mimes/all_mimes.xlsx" + format="xlsx" + includeSql="true"> + + <sql> + select mime_string, count(1) cnt from + profiles p + join mimes m on m.mime_type_id = p.mime_type_id + group by mime_string + order by cnt desc + </sql> + </report> + <report reportName="Container Mimes" + reportFilename="mimes/container_mimes.xlsx" + format="xlsx" + includeSql="true"> + + <sql> + select mime_string, count(1) cnt from + profiles p + join mimes m on m.mime_type_id = p.mime_type_id + where is_embedded=false + group by mime_string + order by cnt desc + </sql> + </report> + + <report reportName="Embedded Mimes" + reportFilename="mimes/embedded_mimes.xlsx" + format="xlsx" + includeSql="true"> + + <sql> + select mime_string, count(1) cnt from + profiles p + join mimes m on m.mime_type_id = p.mime_type_id + where is_embedded=true + group by mime_string + order by cnt desc + </sql> + </report> + + <!-- content --> + <report reportName="Common Tokens by Lang" + reportFilename="content/common_tokens_by_lang.xlsx" + format="xlsx" + includeSql="true"> + + <sql> + select common_tokens_lang, sum(num_common_tokens) as cnt + 
from contents + group by common_tokens_lang + order by cnt desc; + </sql> + </report> + + <report reportName="Detected Languages" + reportFilename="content/detected_langs.xlsx" + format="xlsx" + includeSql="true"> + + <sql> + select LANG_ID_1 as DetectedLang, count(1) as cnt + from contents + group by LANG_ID_1 + order by cnt desc + </sql> + </report> + + + + <report reportName="Token Count by Detected Language" + reportFilename="content/num_tokens_by_detected_langs.xlsx" + format="xlsx" + includeSql="true"> + + <sql> + select LANG_ID_1 as DetectedLang, sum(num_tokens) as cnt + from contents + group by LANG_ID_1 + order by cnt desc; + </sql> + </report> + <report reportName="Exceptions by Type" + reportFilename="exceptions/exceptions_by_type.xlsx" + format="xlsx" + includeSql="true"> + + <sql> + select LANG_ID_1 as DetectedLang, count(1) as cnt + from contents + group by LANG_ID_1 + order by cnt desc + </sql> + </report> + + + <report reportName="Embedded Exceptions by Type" + reportFilename="exceptions/exceptions_by_type_embedded.xlsx" + format="xlsx" + includeSql="true"> + + <sql> + select parse_exception_description, count(1) cnt + from parse_exceptions e + join profiles p on p.id = e.id + join ref_parse_exception_types et on et.parse_exception_type_id=e.parse_exception_type_id + where is_embedded=true + group by parse_exception_description + order by cnt desc; + </sql> + </report> + <after> + + <!--<sql>drop index on x</sql> + --> + </after> +</reports> \ No newline at end of file http://git-wip-us.apache.org/repos/asf/tika/blob/aa7a0c35/tika-eval/src/main/resources/tika-eval-comparison-config.xml ---------------------------------------------------------------------- diff --git a/tika-eval/src/main/resources/tika-eval-comparison-config.xml b/tika-eval/src/main/resources/tika-eval-comparison-config.xml new file mode 100644 index 0000000..b29764e --- /dev/null +++ b/tika-eval/src/main/resources/tika-eval-comparison-config.xml @@ -0,0 +1,83 @@ +<?xml version="1.0" 
encoding="UTF-8" standalone="no" ?> +<!-- + Licensed to the Apache Software Foundation (ASF) under one + or more contributor license agreements. See the NOTICE file + distributed with this work for additional information + regarding copyright ownership. The ASF licenses this file + to you under the Apache License, Version 2.0 (the + "License"); you may not use this file except in compliance + with the License. You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, + software distributed under the License is distributed on an + "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + KIND, either express or implied. See the License for the + specific language governing permissions and limitations + under the License. +--> + +<tika-batch-config + maxAliveTimeSeconds="-1" + pauseOnEarlyTerminationMillis = "500" + timeoutCheckPulseMillis="1000" + maxQueueSize="10000" + numConsumers="5" + timeoutThresholdMillis="300000" + > + + <commandline> + <option opt="c" longOpt="tika-config" hasArg="true" + description="TikaConfig file"/> + <option opt="bc" longOpt="batch-config" hasArg="true" + description="xml batch config file" required="true"/> + <option opt="inputDir" hasArg="true" + description="dir to start crawling"/> + <option opt="numConsumers" hasArg="true" + description="number of fileConsumers threads"/> + <option opt="extractsA" hasArg="true" + description="this dir for analysis" required="false"/> + <option opt="extractsB" hasArg="true" + description="thatDir for analysis"/> + <option opt="db" hasArg="true" + description="name of db directory or file to which to write results"/> + <option opt="alterExtract" hasArg="true" + description="for json-formatted extract files + process full metadata list ('as_is'=default), + take just the first/container document ('first_only'), + concatenate all content into the first metadata item ('concatenate_content')"/> + <option 
opt="includeFilePat" hasArg="true" + description="regex for files to include"/> + </commandline> + + + <!-- + Can also add startDir: this tells the crawler to start indexing a + child directory of the srcDir directory. + --> + <crawler builderClass="org.apache.tika.batch.fs.builders.FSCrawlerBuilder" + crawlOrder="sorted" + maxConsecWaitMillis="30000" + maxFilesToAdd="-1" + maxFilesToConsider="-1" + includeFilePat="" + excludeFilePat="" + maxFileSizeBytes="10000000" + /> + + <consumers builderClass="org.apache.tika.eval.batch.EvalConsumersBuilder" + consumerBuilderClass="org.apache.tika.eval.batch.FileComparerBuilder" + dbAppend="false" + crawlingInputDir="false" + minJsonFileSizeBytes="-1" + maxJsonFileSizeBytes="2000000" + commonTokens="resources/commontokens" + /> + +<!-- langModelDir="resources/langmodels" --> + + <!-- reporter and interrupter are optional --> + <reporter builderClass="org.apache.tika.batch.builders.SimpleLogReporterBuilder" sleepMillis="1000" staleThresholdMillis="500000"/> + <interrupter builderClass="org.apache.tika.batch.builders.InterrupterBuilder"/> +</tika-batch-config> http://git-wip-us.apache.org/repos/asf/tika/blob/aa7a0c35/tika-eval/src/main/resources/tika-eval-profiler-config.xml ---------------------------------------------------------------------- diff --git a/tika-eval/src/main/resources/tika-eval-profiler-config.xml b/tika-eval/src/main/resources/tika-eval-profiler-config.xml new file mode 100644 index 0000000..bd94b25 --- /dev/null +++ b/tika-eval/src/main/resources/tika-eval-profiler-config.xml @@ -0,0 +1,76 @@ +<?xml version="1.0" encoding="UTF-8" standalone="no" ?> +<!-- + Licensed to the Apache Software Foundation (ASF) under one + or more contributor license agreements. See the NOTICE file + distributed with this work for additional information + regarding copyright ownership. 
The ASF licenses this file + to you under the Apache License, Version 2.0 (the + "License"); you may not use this file except in compliance + with the License. You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, + software distributed under the License is distributed on an + "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + KIND, either express or implied. See the License for the + specific language governing permissions and limitations + under the License. +--> + +<tika-batch-config + maxAliveTimeSeconds="-1" + pauseOnEarlyTerminationMillis="500" + timeoutCheckPulseMillis="1000" + maxQueueSize="10000" + numConsumers="5" + timeoutThresholdMillis="300000"> + + <commandline> + <option opt="c" longOpt="tika-config" hasArg="true" + description="TikaConfig file"/> + + <option opt="bc" longOpt="batch-config" hasArg="true" + description="xml batch config file" required="true"/> + <option opt="inputDir" hasArg="true" + description="dir to start crawling"/> + <option opt="numConsumers" hasArg="true" + description="number of fileConsumers threads"/> + <option opt="extractDir" hasArg="true" + description="this dir for analysis" required="false"/> + <option opt="db" hasArg="true" + description="name of db directory or file to which to write results"/> + <option opt="alterExtract" hasArg="true" + description="for json-formatted extract files + process full metadata list ('as_is'=default), + take just the first/container document ('first_only'), + concatenate all content into the first metadata item ('concatenate_content')"/> + + </commandline> + + + <!-- + Can also add startDir: this tells the crawler to start indexing a + child directory of the inputDir directory. 
+ --> + <crawler builderClass="org.apache.tika.batch.fs.builders.FSCrawlerBuilder" + + crawlOrder="sorted" + maxConsecWaitMillis="5000" + maxFilesToAdd="-1" + maxFilesToConsider="-1" + includeFilePat="" + excludeFilePat="" + maxFileSizeBytes="-1" + /> + + <consumers builderClass="org.apache.tika.eval.batch.EvalConsumersBuilder" + consumerBuilderClass="org.apache.tika.eval.batch.SingleFileConsumerBuilder" + commonTokens="resources/commontokens"/> + + + <!-- reporter and interrupter are optional --> + <reporter builderClass="org.apache.tika.batch.builders.SimpleLogReporterBuilder" sleepMillis="1000" + staleThresholdMillis="500000"/> + <interrupter builderClass="org.apache.tika.batch.builders.InterrupterBuilder"/> +</tika-batch-config> http://git-wip-us.apache.org/repos/asf/tika/blob/aa7a0c35/tika-eval/src/test/java/org/apache/tika/MockDBWriter.java ---------------------------------------------------------------------- diff --git a/tika-eval/src/test/java/org/apache/tika/MockDBWriter.java b/tika-eval/src/test/java/org/apache/tika/MockDBWriter.java new file mode 100644 index 0000000..b2edab7 --- /dev/null +++ b/tika-eval/src/test/java/org/apache/tika/MockDBWriter.java @@ -0,0 +1,73 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+ * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.apache.tika; + + +import java.io.IOException; +import java.util.ArrayList; +import java.util.HashMap; +import java.util.List; +import java.util.Map; + +import org.apache.tika.eval.db.Cols; +import org.apache.tika.eval.db.TableInfo; +import org.apache.tika.eval.io.IDBWriter; + +public class MockDBWriter implements IDBWriter { + //Map of tableName and tables + //each table consists of a list of rows. + //Each row consists of a map of columns/values + Map<String, List<Map<Cols, String>>> db = new HashMap<String, List<Map<Cols, String>>>(); + + public MockDBWriter() throws Exception { + } + + @Override + public void writeRow(TableInfo tableInfo, Map<Cols, String> row) throws IOException { + List<Map<Cols, String>> table = db.get(tableInfo.getName()); + if (table == null) { + table = new ArrayList<Map<Cols, String>>(); + } + table.add(row); + db.put(tableInfo.getName(), table); + } + + @Override + public void close() throws IOException { + //no-op + } + + @Override + public int getMimeId(String mimeString) { + //TODO: fix this + return 0; + } + + public List<Map<Cols, String>> getTable(TableInfo tableInfo) { + if (db.get(tableInfo.getName()) == null) { + System.err.println("I can't seem to find: "+ tableInfo.getName() + ", but I do see:"); + for (String table : db.keySet()) { + System.err.println(table); + } + } + return db.get(tableInfo.getName()); + } + + public void clear() { + db.clear(); + } +} http://git-wip-us.apache.org/repos/asf/tika/blob/aa7a0c35/tika-eval/src/test/java/org/apache/tika/eval/AnalyzerManagerTest.java ---------------------------------------------------------------------- diff --git a/tika-eval/src/test/java/org/apache/tika/eval/AnalyzerManagerTest.java b/tika-eval/src/test/java/org/apache/tika/eval/AnalyzerManagerTest.java new file mode 100644 index 0000000..f1b9163 --- /dev/null +++ 
b/tika-eval/src/test/java/org/apache/tika/eval/AnalyzerManagerTest.java @@ -0,0 +1,79 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.tika.eval; + +import static org.junit.Assert.assertTrue; +import static org.junit.Assert.fail; + +import java.util.HashSet; +import java.util.Set; + +import org.apache.lucene.analysis.Analyzer; +import org.apache.lucene.analysis.TokenStream; +import org.apache.lucene.analysis.tokenattributes.CharTermAttribute; +import org.apache.tika.eval.tokens.AnalyzerManager; +import org.junit.Test; + +public class AnalyzerManagerTest { + + @Test + public void testGeneral() throws Exception { + AnalyzerManager analyzerManager = AnalyzerManager.newInstance(); + Analyzer general = analyzerManager.getGeneralAnalyzer(); + TokenStream ts = general.tokenStream("f", "tHe quick aaaa aaa anD dirty dog"); + ts.reset(); + + CharTermAttribute termAtt = ts.getAttribute(CharTermAttribute.class); + Set<String> seen = new HashSet<>(); + while (ts.incrementToken()) { + seen.add(termAtt.toString()); + } + ts.end(); + ts.close(); + + assertTrue(seen.contains("the")); + assertTrue(seen.contains("and")); + assertTrue(seen.contains("dog")); + + } + + @Test + public void testCommon() throws 
Exception { + AnalyzerManager analyzerManager = AnalyzerManager.newInstance(); + Analyzer common = analyzerManager.getAlphaIdeoAnalyzer(); + TokenStream ts = common.tokenStream("f", "the 5,000.12 and dirty dog"); + ts.reset(); + CharTermAttribute termAtt = ts.getAttribute(CharTermAttribute.class); + Set<String> seen = new HashSet<>(); + while (ts.incrementToken()) { + if (termAtt.toString().contains("5")) { + fail("Shouldn't have found a numeric"); + } + seen.add(termAtt.toString()); + } + ts.end(); + ts.close(); + + assertTrue(seen.contains("the")); + assertTrue(seen.contains("and")); + assertTrue(seen.contains("dog")); + + + } + +}
