This is an automated email from the ASF dual-hosted git repository. tallison pushed a commit to branch branch_1x in repository https://gitbox.apache.org/repos/asf/tika.git
commit 86cdbb11a53e040afbce4bbc91d3d1b95efa8f4c Author: tallison <[email protected]> AuthorDate: Tue Apr 14 10:57:25 2020 -0400 add optional postgres dialect for comparison reports; improve initialization of ref tables in tika-eval --- .../tika/eval/batch/EvalConsumerBuilder.java | 37 +++++++++------- ...rison-reports.xml => comparison-reports-pg.xml} | 49 +++++++++++----------- .../src/main/resources/comparison-reports.xml | 3 +- 3 files changed, 50 insertions(+), 39 deletions(-) diff --git a/tika-eval/src/main/java/org/apache/tika/eval/batch/EvalConsumerBuilder.java b/tika-eval/src/main/java/org/apache/tika/eval/batch/EvalConsumerBuilder.java index 694b05e..b50d4a1 100644 --- a/tika-eval/src/main/java/org/apache/tika/eval/batch/EvalConsumerBuilder.java +++ b/tika-eval/src/main/java/org/apache/tika/eval/batch/EvalConsumerBuilder.java @@ -19,12 +19,11 @@ package org.apache.tika.eval.batch; import java.io.IOException; import java.sql.Connection; +import java.sql.ResultSet; import java.sql.SQLException; import java.util.HashMap; import java.util.List; -import java.util.Locale; import java.util.Map; -import java.util.Set; import java.util.concurrent.ArrayBlockingQueue; import java.util.concurrent.atomic.AtomicInteger; @@ -41,9 +40,13 @@ import org.apache.tika.eval.io.ExtractReader; import org.apache.tika.eval.io.ExtractReaderException; import org.apache.tika.eval.io.IDBWriter; import org.apache.tika.util.PropsUtil; +import org.slf4j.Logger; +import org.slf4j.LoggerFactory; public abstract class EvalConsumerBuilder { - private AtomicInteger count = new AtomicInteger(0); + + private static final Logger LOG = LoggerFactory.getLogger(EvalConsumerBuilder.class); + protected ArrayBlockingQueue<FileResource> queue; Map<String, String> localAttrs; JDBCUtil dbUtil; @@ -75,7 +78,7 @@ public abstract class EvalConsumerBuilder { //step 3. create mime buffer this.mimeBuffer = new MimeBuffer(dbUtil.getConnection(), TikaConfig.getDefaultConfig()); - //step 4. populate the reference tabless + //step 4. populate the reference tables populateRefTables(); return mimeBuffer; @@ -100,21 +103,27 @@ public abstract class EvalConsumerBuilder { protected abstract void addErrorLogTablePairs(DBConsumersManager manager); public void populateRefTables() throws IOException, SQLException { - //test for one ref table. If it exists, don't populate ref tables - //TODO: test one at a time - boolean tableExists = false; + boolean refTablesPopulated = true; try (Connection connection = dbUtil.getConnection()) { - Set<String> tables = dbUtil.getTables(connection); - if (tables.contains( - AbstractProfiler.REF_PARSE_ERROR_TYPES.getName().toLowerCase(Locale.US) - )) { - tableExists = true; + for (TableInfo tableInfo : getRefTableInfos()) { + int rows = 0; + try (ResultSet rs = connection.createStatement().executeQuery("select * from "+ + tableInfo.getName())) { + while (rs.next()) { + rows++; + } + } + if (rows == 0) { + refTablesPopulated = false; + break; + } + } } catch (SQLException e) { //swallow } - - if (tableExists) { + if (refTablesPopulated) { + LOG.info("ref tables are already populated"); return; } diff --git a/tika-eval/src/main/resources/comparison-reports.xml b/tika-eval/src/main/resources/comparison-reports-pg.xml similarity index 97% copy from tika-eval/src/main/resources/comparison-reports.xml copy to tika-eval/src/main/resources/comparison-reports-pg.xml index e84454a..5bcf88e 100644 --- a/tika-eval/src/main/resources/comparison-reports.xml +++ b/tika-eval/src/main/resources/comparison-reports-pg.xml @@ -25,24 +25,24 @@ <before> <sql>drop table if exists md5_multiples_tmp_a</sql> - <sql>create table md5_multiples_tmp_a (MD5 char(32), cnt int) + <sql>create table md5_multiples_tmp_a (MD5, cnt) as - select md5, count(1) cnt + select md5, count(1) as cnt from profiles_a where md5 is not null group by md5 - having cnt > 1 + having count(1) > 1 order by cnt desc </sql> <sql>drop table if exists md5_multiples_tmp_b</sql> - <sql>create table md5_multiples_tmp_b (MD5 char(32), cnt int) + <sql>create table md5_multiples_tmp_b (MD5, cnt) as select md5, count(1) cnt from profiles_b where md5 is not null group by md5 - having cnt > 1 + having count(1) > 1 order by cnt desc </sql> <!-- build mime indexes --> @@ -133,12 +133,12 @@ create table token_counts_compared (mime_id_a integer, mime_id_b integer, - num_tokens_a long default 0, - num_tokens_b long default 0, - num_alphabetic_tokens_a long default 0, - num_alphabetic_tokens_b long default 0, - num_common_tokens_a long default 0, - num_common_tokens_b long default 0 + num_tokens_a bigint default 0, + num_tokens_b bigint default 0, + num_alphabetic_tokens_a bigint default 0, + num_alphabetic_tokens_b bigint default 0, + num_common_tokens_a bigint default 0, + num_common_tokens_b bigint default 0 ); </sql> <sql> @@ -674,7 +674,7 @@ mime_id_b integer, total_a bigint, total_b bigint, - prcnt_increase double + prcnt_increase double precision ); </sql> <sql> @@ -704,7 +704,7 @@ group by mime_id_a, mime_id_b) </sql> <sql> - update parse_time_compared ptc set prcnt_increase=( + update parse_time_compared ptc set prcnt_increase=(100.0 * cast(total_b as decimal)/cast(total_a as decimal)) where total_a > 0; </sql> @@ -913,7 +913,7 @@ join mimes m on m.mime_id=p.mime_id join ref_parse_exception_types r on r.parse_exception_id=e.parse_exception_id - group by p.mime_id, parse_exception_description + group by m.mime_string, parse_exception_description order by MIME_TYPE, EXCEPTION_TYPE </sql> </report> @@ -932,7 +932,7 @@ join mimes m on m.mime_id=p.mime_id join ref_parse_exception_types r on r.parse_exception_id=e.parse_exception_id - group by p.mime_id, parse_exception_description + group by m.mime_string, parse_exception_description order by MIME_TYPE, EXCEPTION_TYPE </sql> </report> @@ -963,7 +963,7 @@ left join exceptions_a ea on ca.id = ea.id where eb.orig_stack_trace is not null and ea.orig_stack_trace is null - order by ca.num_common_tokens - ifnull(cb.num_common_tokens,0) desc + order by ca.num_common_tokens - coalesce(cb.num_common_tokens,0) desc </sql> </report> @@ -1004,7 +1004,7 @@ from exceptions_a ea left join exceptions_b eb on ea.id = eb.id join profiles_a pa on pa.id=ea.id - join profiles_b pb on pb.id=pa.id //this ensures that files were actually processed in both runs + join profiles_b pb on pb.id=pa.id --this ensures that files were actually processed in both runs join containers c on pa.container_id=c.container_id join mimes ma on ma.mime_id=pa.mime_id join mimes mb on mb.mime_id=pb.mime_id @@ -1211,7 +1211,7 @@ from exceptions_a e join ref_parse_exception_types t on t.parse_exception_id=e.parse_exception_id - group by e.parse_exception_id + group by t.parse_exception_description </sql> </report> <report reportName="parseExceptionTypesB" @@ -1223,7 +1223,7 @@ from exceptions_b e join ref_parse_exception_types t on t.parse_exception_id=e.parse_exception_id - group by e.parse_exception_id + group by t.parse_exception_description </sql> </report> @@ -1245,8 +1245,8 @@ ca.num_common_tokens as NUM_COMMON_TOKENS_A, cb.common_tokens_lang as COMMON_TOKENS_LANG_B, cb.num_common_tokens as NUM_COMMON_TOKENS_B, - ifnull(cb.num_common_tokens,0)- - ifnull(ca.num_common_tokens, 0) as NUM_COMMON_TOKENS_DIFF_IN_B, + coalesce(cb.num_common_tokens,0)- + coalesce(ca.num_common_tokens, 0) as NUM_COMMON_TOKENS_DIFF_IN_B, ca.top_n_tokens as TOP_N_TOKENS_A, cb.top_n_tokens as TOP_N_TOKENS_B, ca.unicode_char_blocks as UNICODE_CHAR_BLOCKS_A, @@ -1295,8 +1295,8 @@ ca.num_common_tokens as NUM_COMMON_TOKENS_A, cb.common_tokens_lang as COMMON_TOKENS_LANG_B, cb.num_common_tokens as NUM_COMMON_TOKENS_B, - ifnull(cb.num_common_tokens,0)- - ifnull(ca.num_common_tokens, 0) as NUM_COMMON_TOKENS_DIFF_IN_B, + coalesce(cb.num_common_tokens,0)- + coalesce(ca.num_common_tokens, 0) as NUM_COMMON_TOKENS_DIFF_IN_B, ca.top_n_tokens as TOP_N_TOKENS_A, cb.top_n_tokens as TOP_N_TOKENS_B, ca.unicode_char_blocks as UNICODE_CHAR_BLOCKS_A, @@ -1332,7 +1332,7 @@ num_tokens_a, num_tokens_b, num_alphabetic_tokens_a, num_alphabetic_tokens_b, num_common_tokens_a, num_common_tokens_b, - ifnull(num_common_tokens_b, 0)-ifnull(num_common_tokens_a, 0) as change_in_common_tokens_b + coalesce(num_common_tokens_b, 0)-coalesce(num_common_tokens_a, 0) as change_in_common_tokens_b from token_counts_compared tcc join mimes ma on tcc.mime_id_a = ma.mime_id join mimes mb on tcc.mime_id_b = mb.mime_id @@ -1722,6 +1722,7 @@ from parse_time_compared ptc join mimes ma on ptc.mime_id_a=ma.mime_id join mimes mb on ptc.mime_id_b=mb.mime_id + where TOTAL_A > 1000 AND TOTAL_B > 1000 -- only show comparisons if > a second order by prcnt_increase desc </sql> </report> diff --git a/tika-eval/src/main/resources/comparison-reports.xml b/tika-eval/src/main/resources/comparison-reports.xml index e84454a..e23ec5e 100644 --- a/tika-eval/src/main/resources/comparison-reports.xml +++ b/tika-eval/src/main/resources/comparison-reports.xml @@ -704,7 +704,7 @@ group by mime_id_a, mime_id_b) </sql> <sql> - update parse_time_compared ptc set prcnt_increase=( + update parse_time_compared ptc set prcnt_increase=(100.0 * cast(total_b as decimal)/cast(total_a as decimal)) where total_a > 0; </sql> @@ -1722,6 +1722,7 @@ from parse_time_compared ptc join mimes ma on ptc.mime_id_a=ma.mime_id join mimes mb on ptc.mime_id_b=mb.mime_id + where TOTAL_A > 1000 AND TOTAL_B > 1000 -- only show comparisons if > a second order by prcnt_increase desc </sql> </report>
