This is an automated email from the ASF dual-hosted git repository. tallison pushed a commit to branch branch_1x in repository https://gitbox.apache.org/repos/asf/tika.git
The following commit(s) were added to refs/heads/branch_1x by this push: new fab1954 TIKA-2827 -- include both mime_a and mime_b more often in comparison diff reports fab1954 is described below commit fab1954ed7b2ef8964477a821dc6bc22414592f6 Author: TALLISON <talli...@apache.org> AuthorDate: Thu Feb 14 09:13:38 2019 -0500 TIKA-2827 -- include both mime_a and mime_b more often in comparison diff reports --- .../src/main/resources/comparison-reports.xml | 70 ++++++++++++++-------- 1 file changed, 46 insertions(+), 24 deletions(-) diff --git a/tika-eval/src/main/resources/comparison-reports.xml b/tika-eval/src/main/resources/comparison-reports.xml index 7c20ffd..48c3523 100644 --- a/tika-eval/src/main/resources/comparison-reports.xml +++ b/tika-eval/src/main/resources/comparison-reports.xml @@ -930,16 +930,20 @@ includeSql="true"> <sql> - select mime_string as MIME_TYPE, count(1) as COUNT + select + ma.mime_string as MIME_TYPE_A, + mb.mime_string as MIME_TYPE_B, + count(1) as COUNT from exceptions_a ea left join exceptions_b eb on ea.id = eb.id join profiles_a pa on pa.id=ea.id join profiles_b pb on pa.id=pb.id join containers c on pa.container_id=c.container_id - join mimes m on m.mime_id=pa.mime_id + join mimes ma on ma.mime_id=pa.mime_id + join mimes mb on mb.mime_id=pb.mime_id where eb.id is null and ea.parse_exception_id=0 - group by mime_string + group by mime_type_a, mime_type_b </sql> </report> @@ -951,17 +955,19 @@ select file_path, c.length as CONTAINER_LENGTH, - mime_string as MIME_TYPE, + ma.mime_string as MIME_TYPE_A, + mb.mime_string as MIME_TYPE_B, pa.file_name, pa.is_embedded from exceptions_a ea left join exceptions_b eb on ea.id = eb.id join profiles_a pa on pa.id=ea.id join profiles_b pb on pb.id=pa.id //this ensures that files were actually processed in both runs join containers c on pa.container_id=c.container_id - join mimes m on m.mime_id=pa.mime_id + join mimes ma on ma.mime_id=pa.mime_id + join mimes mb on mb.mime_id=pb.mime_id where eb.id is null and ea.parse_exception_id=0 - order by mime_string + order by mime_type_a, mime_type_b </sql> </report> <report reportName="ContentsOfFixedExceptionsInB" @@ -972,16 +978,19 @@ <sql> select file_path, c.length as CONTAINER_LENGTH, - mime_string as MIME_TYPE, + ma.mime_string as MIME_TYPE_A, + mb.mime_string as MIME_TYPE_B, CONTENT_LENGTH, NUM_TOKENS, NUM_UNIQUE_TOKENS, TOP_N_TOKENS, LANG_ID_1,TOKEN_LENGTH_MEAN, TOKEN_LENGTH_STD_DEV from exceptions_a ea left join exceptions_b eb on ea.id = eb.id - join profiles_a p on p.id=ea.id + join profiles_a pa on pa.id=ea.id + join profiles_b pb on pa.id=pb.id join contents_b cb on cb.id=ea.id - join containers c on p.container_id=c.container_id - join mimes m on m.mime_id=p.mime_id + join containers c on pa.container_id=c.container_id + join mimes ma on ma.mime_id=pa.mime_id + join mimes mb on mb.mime_id=pb.mime_id where eb.id is null and ea.parse_exception_id=0 </sql> @@ -993,16 +1002,17 @@ includeSql="true"> <sql> - select mime_string as MIME_TYPE_A, count(1) as COUNT + select ma.mime_string as MIME_TYPE_A, mb.mime_string as MIME_TYPE_B, count(1) as COUNT from exceptions_b eb left join exceptions_a ea on ea.id = eb.id join profiles_a pa on pa.id=eb.id join profiles_b pb on pb.id=pa.id join containers c on pa.container_id=c.container_id - join mimes m on m.mime_id=pa.mime_id + join mimes ma on ma.mime_id=pa.mime_id + join mimes mb on mb.mime_id=pb.mime_id where ea.id is null and eb.parse_exception_id=0 - group by mime_string + group by ma.mime_string, mb.mime_string order by COUNT desc </sql> </report> @@ -1013,16 +1023,21 @@ includeSql="true"> <sql> - select MIME_STRING as MIME_TYPE, eb.sort_stack_trace, count(1) as + select + ma.MIME_STRING as MIME_TYPE_A, + mb.MIME_STRING as MIME_TYPE_B, + eb.sort_stack_trace, count(1) as COUNT from exceptions_b eb left join exceptions_a ea on ea.id = eb.id - join profiles_a p on p.id=eb.id - join mimes m on m.mime_id=p.mime_id + join profiles_a pa on pa.id=eb.id + join profiles_b pb on pb.id=eb.id + join mimes ma on ma.mime_id=pa.mime_id + join mimes mb on mb.mime_id=pb.mime_id where ea.id is null and eb.parse_exception_id=0 - group by MIME_TYPE, eb.sort_stack_trace - order by MIME_TYPE asc, COUNT desc + group by MIME_TYPE_A, MIME_TYPE_B, eb.sort_stack_trace + order by MIME_TYPE_A asc, MIME_TYPE_B asc, COUNT desc </sql> </report> @@ -1034,16 +1049,19 @@ <sql> select file_path, c.length as CONTAINER_LENGTH, - mime_string as MIME_TYPE, + ma.mime_string as MIME_TYPE_A, + mb.mime_string as MIME_TYPE_B, eb.orig_stack_trace, eb.sort_stack_trace from exceptions_b eb left join exceptions_a ea on ea.id = eb.id - join profiles_a p on p.id=eb.id - join containers c on p.container_id=c.container_id - join mimes m on m.mime_id=p.mime_id + join profiles_a pa on pa.id=eb.id + join profiles_b pb on pb.id=eb.id + join containers c on pa.container_id=c.container_id + join mimes ma on ma.mime_id=pa.mime_id + join mimes mb on mb.mime_id=pb.mime_id where ea.id is null and eb.parse_exception_id=0 - order by MIME_TYPE asc, eb.ORIG_STACK_TRACE + order by MIME_TYPE_A asc, MIME_TYPE_B asc, eb.ORIG_STACK_TRACE </sql> </report> @@ -1192,7 +1210,9 @@ cb.unicode_char_blocks as UNICODE_CHAR_BLOCKS_B, top_10_unique_token_diffs_a, top_10_unique_token_diffs_b, - top_10_more_in_a, top_10_more_in_b, dice_coefficient, overlap + top_10_more_in_a, top_10_more_in_b, dice_coefficient, overlap, + ref_ea.parse_exception_description as EXCEPTION_A, + ref_eb.parse_exception_description as EXCEPTION_B from content_comparisons cc join contents_a ca on ca.id=cc.id left join contents_b cb on cb.id=cc.id @@ -1203,6 +1223,8 @@ join mimes mb on mb.mime_id=pb.mime_id left join exceptions_a ea on ea.id=cc.id left join exceptions_b eb on eb.id=cc.id + left join ref_parse_exception_types ref_ea on ref_ea.parse_exception_id=ea.parse_exception_id + left join ref_parse_exception_types ref_eb on ref_eb.parse_exception_id=eb.parse_exception_id where (overlap < 0.95 or abs(ca.NUM_TOKENS-cb.NUM_TOKENS) >30) and (ea.parse_exception_id is null or ea.parse_exception_id <> 2)