This is an automated email from the ASF dual-hosted git repository. tallison pushed a commit to branch branch_1x in repository https://gitbox.apache.org/repos/asf/tika.git
commit b6e7af41dc12706443bc2a2ec8d72ba4a3a16a67 Author: tallison <[email protected]> AuthorDate: Thu Dec 16 10:28:54 2021 -0500 TIKA-3613 -- general updates for 1.28 --- tika-eval/pom.xml | 2 +- .../src/main/resources/comparison-reports-pg.xml | 2 + .../src/main/resources/comparison-reports.xml | 627 +++------------------ tika-eval/src/main/resources/db.properties | 1 + tika-eval/src/main/resources/profile-reports.xml | 67 +-- tika-parent/pom.xml | 4 +- 6 files changed, 86 insertions(+), 617 deletions(-) diff --git a/tika-eval/pom.xml b/tika-eval/pom.xml index 45702b0..dc9876a 100644 --- a/tika-eval/pom.xml +++ b/tika-eval/pom.xml @@ -66,7 +66,7 @@ <dependency> <groupId>com.h2database</groupId> <artifactId>h2</artifactId> - <version>1.4.200</version> + <version>2.0.202</version> </dependency> <dependency> <groupId>commons-cli</groupId> diff --git a/tika-eval/src/main/resources/comparison-reports-pg.xml b/tika-eval/src/main/resources/comparison-reports-pg.xml index 5bcf88e..5940f46 100644 --- a/tika-eval/src/main/resources/comparison-reports-pg.xml +++ b/tika-eval/src/main/resources/comparison-reports-pg.xml @@ -19,6 +19,8 @@ under the License. --> +<!-- this should be the same as comparison-reports.xml but + translated from the H2 dialect to postgres --> <reports> diff --git a/tika-eval/src/main/resources/comparison-reports.xml b/tika-eval/src/main/resources/comparison-reports.xml index b31f46e..8aa3363 100644 --- a/tika-eval/src/main/resources/comparison-reports.xml +++ b/tika-eval/src/main/resources/comparison-reports.xml @@ -107,12 +107,12 @@ </sql> <sql> update exceptions_compared - set exc_prcnt_a = cast(exc_cnt_a as decimal)/cast(total as decimal) + set exc_prcnt_a = cast(exc_cnt_a as float)/cast(total as float) where total > 0; </sql> <sql> update exceptions_compared - set exc_prcnt_b = cast(exc_cnt_b as decimal)/cast(total as decimal) + set exc_prcnt_b = cast(exc_cnt_b as float)/cast(total as float) where total > 0; </sql> @@ -217,454 +217,6 @@ group by mime_id_a, mime_id_b ); </sql> - - <sql>drop table if exists tags_by_mime</sql> - <sql>create table tags_by_mime ( - mime_id_a integer, - mime_id_b integer, - tags_a_a integer, - tags_b_a integer, - tags_div_a integer, - tags_i_a integer, - tags_img_a integer, - tags_li_a integer, - tags_ol_a integer, - tags_p_a integer, - tags_table_a integer, - tags_td_a integer, - tags_title_a integer, - tags_tr_a integer, - tags_u_a integer, - tags_ul_a integer, - tags_a_b integer, - tags_b_b integer, - tags_div_b integer, - tags_i_b integer, - tags_img_b integer, - tags_li_b integer, - tags_ol_b integer, - tags_p_b integer, - tags_table_b integer, - tags_td_b integer, - tags_title_b integer, - tags_tr_b integer, - tags_u_b integer, - tags_ul_b integer - ); - </sql> - <sql> - insert into tags_by_mime (mime_id_a, mime_id_b) - select ma.mime_id, mb.mime_id - from profiles_a a - join profiles_b b on a.id=b.id - join mimes ma on ma.mime_id=a.mime_id - join mimes mb on mb.mime_id=b.mime_id - group by ma.mime_id, mb.mime_id - </sql> - <sql> - update tags_by_mime tbm set tags_a_a=( - select sum(ta.tags_a) as cnt from tags_a ta - join tags_b tb on ta.id=tb.id - join profiles_a pa on pa.id=ta.id - join profiles_b pb on pa.id=pb.id - where pb.mime_id= tbm.mime_id_b - and pa.mime_id=tbm.mime_id_a - and ta.tags_parse_exception=false - and tb.tags_parse_exception=false - group by mime_id_a, mime_id_b - ); - </sql> - <sql> - update tags_by_mime tbm set tags_b_a=( - select sum(ta.tags_b) as cnt from tags_a ta - join tags_b tb on ta.id=tb.id - join profiles_a pa on pa.id=ta.id - join profiles_b pb on pa.id=pb.id - where pb.mime_id= tbm.mime_id_b - and pa.mime_id=tbm.mime_id_a - and ta.tags_parse_exception=false - and tb.tags_parse_exception=false - group by mime_id_a, mime_id_b - ); - </sql> - <sql> - update tags_by_mime tbm set tags_div_a=( - select sum(ta.tags_div) as cnt from tags_a ta - join tags_b tb on ta.id=tb.id - join profiles_a pa on pa.id=ta.id - join profiles_b pb on pa.id=pb.id - where pb.mime_id= tbm.mime_id_b - and pa.mime_id=tbm.mime_id_a - and ta.tags_parse_exception=false - and tb.tags_parse_exception=false - group by mime_id_a, mime_id_b - ); - </sql> - <sql> - update tags_by_mime tbm set tags_i_a=( - select sum(ta.tags_i) as cnt from tags_a ta - join tags_b tb on ta.id=tb.id - join profiles_a pa on pa.id=ta.id - join profiles_b pb on pa.id=pb.id - where pb.mime_id= tbm.mime_id_b - and pa.mime_id=tbm.mime_id_a - and ta.tags_parse_exception=false - and tb.tags_parse_exception=false - group by mime_id_a, mime_id_b - ); - </sql> - <sql> - update tags_by_mime tbm set tags_img_a=( - select sum(ta.tags_img) as cnt from tags_a ta - join tags_b tb on ta.id=tb.id - join profiles_a pa on pa.id=ta.id - join profiles_b pb on pa.id=pb.id - where pb.mime_id= tbm.mime_id_b - and pa.mime_id=tbm.mime_id_a - and ta.tags_parse_exception=false - and tb.tags_parse_exception=false - group by mime_id_a, mime_id_b - ); - </sql> - <sql> - update tags_by_mime tbm set tags_li_a=( - select sum(ta.tags_li) as cnt from tags_a ta - join tags_b tb on ta.id=tb.id - join profiles_a pa on pa.id=ta.id - join profiles_b pb on pa.id=pb.id - where pb.mime_id= tbm.mime_id_b - and pa.mime_id=tbm.mime_id_a - and ta.tags_parse_exception=false - and tb.tags_parse_exception=false - group by mime_id_a, mime_id_b - ); - </sql> - <sql> - update tags_by_mime tbm set tags_ol_a=( - select sum(ta.tags_ol) as cnt from tags_a ta - join tags_b tb on ta.id=tb.id - join profiles_a pa on pa.id=ta.id - join profiles_b pb on pa.id=pb.id - where pb.mime_id= tbm.mime_id_b - and pa.mime_id=tbm.mime_id_a - and ta.tags_parse_exception=false - and tb.tags_parse_exception=false - group by mime_id_a, mime_id_b - ); - </sql> - <sql> - update tags_by_mime tbm set tags_p_a=( - select sum(ta.tags_p) as cnt from tags_a ta - join tags_b tb on ta.id=tb.id - join profiles_a pa on pa.id=ta.id - join profiles_b pb on pa.id=pb.id - where pb.mime_id= tbm.mime_id_b - and pa.mime_id=tbm.mime_id_a - and ta.tags_parse_exception=false - and tb.tags_parse_exception=false - group by mime_id_a, mime_id_b - ); - </sql> - <sql> - update tags_by_mime tbm set tags_table_a=( - select sum(ta.tags_table) as cnt from tags_a ta - join tags_b tb on ta.id=tb.id - join profiles_a pa on pa.id=ta.id - join profiles_b pb on pa.id=pb.id - where pb.mime_id= tbm.mime_id_b - and pa.mime_id=tbm.mime_id_a - and ta.tags_parse_exception=false - and tb.tags_parse_exception=false - group by mime_id_a, mime_id_b - ); - </sql> - <sql> - update tags_by_mime tbm set tags_td_a=( - select sum(ta.tags_td) as cnt from tags_a ta - join tags_b tb on ta.id=tb.id - join profiles_a pa on pa.id=ta.id - join profiles_b pb on pa.id=pb.id - where pb.mime_id= tbm.mime_id_b - and pa.mime_id=tbm.mime_id_a - and ta.tags_parse_exception=false - and tb.tags_parse_exception=false - group by mime_id_a, mime_id_b - ); - </sql> - <sql> - update tags_by_mime tbm set tags_title_a=( - select sum(ta.tags_title) as cnt from tags_a ta - join tags_b tb on ta.id=tb.id - join profiles_a pa on pa.id=ta.id - join profiles_b pb on pa.id=pb.id - where pb.mime_id= tbm.mime_id_b - and pa.mime_id=tbm.mime_id_a - and ta.tags_parse_exception=false - and tb.tags_parse_exception=false - group by mime_id_a, mime_id_b - ); - </sql> - <sql> - update tags_by_mime tbm set tags_tr_a=( - select sum(ta.tags_tr) as cnt from tags_a ta - join tags_b tb on ta.id=tb.id - join profiles_a pa on pa.id=ta.id - join profiles_b pb on pa.id=pb.id - where pb.mime_id= tbm.mime_id_b - and pa.mime_id=tbm.mime_id_a - and ta.tags_parse_exception=false - and tb.tags_parse_exception=false - group by mime_id_a, mime_id_b - ); - </sql> - <sql> - update tags_by_mime tbm set tags_u_a=( - select sum(ta.tags_u) as cnt from tags_a ta - join tags_b tb on ta.id=tb.id - join profiles_a pa on pa.id=ta.id - join profiles_b pb on pa.id=pb.id - where pb.mime_id= tbm.mime_id_b - and pa.mime_id=tbm.mime_id_a - and ta.tags_parse_exception=false - and tb.tags_parse_exception=false - group by mime_id_a, mime_id_b - ); - </sql> - <sql> - update tags_by_mime tbm set tags_ul_a=( - select sum(ta.tags_ul) as cnt from tags_a ta - join tags_b tb on ta.id=tb.id - join profiles_a pa on pa.id=ta.id - join profiles_b pb on pa.id=pb.id - where pb.mime_id= tbm.mime_id_b - and pa.mime_id=tbm.mime_id_a - and ta.tags_parse_exception=false - and tb.tags_parse_exception=false - group by mime_id_a, mime_id_b - ); - </sql> - <!-- now update tags_b counts --> - <sql> - update tags_by_mime tbm set tags_a_b=( - select sum(tb.tags_a) as cnt from tags_b tb - join tags_a ta on ta.id=tb.id - join profiles_a pa on pa.id=ta.id - join profiles_b pb on pa.id=pb.id - where pb.mime_id= tbm.mime_id_b - and pa.mime_id=tbm.mime_id_a - and ta.tags_parse_exception=false - and tb.tags_parse_exception=false - group by mime_id_a, mime_id_b - ); - </sql> - <sql> - update tags_by_mime tbm set tags_b_b=( - select sum(tb.tags_b) as cnt from tags_b tb - join tags_a ta on ta.id=tb.id - join profiles_a pa on pa.id=ta.id - join profiles_b pb on pa.id=pb.id - where pb.mime_id= tbm.mime_id_b - and pa.mime_id=tbm.mime_id_a - and ta.tags_parse_exception=false - and tb.tags_parse_exception=false - group by mime_id_a, mime_id_b - ); - </sql> - <sql> - update tags_by_mime tbm set tags_div_b=( - select sum(tb.tags_div) as cnt from tags_b tb - join tags_a ta on tb.id=ta.id - join profiles_a pa on pa.id=ta.id - join profiles_b pb on pa.id=pb.id - where pb.mime_id= tbm.mime_id_b - and pa.mime_id=tbm.mime_id_a - and ta.tags_parse_exception=false - and tb.tags_parse_exception=false - group by mime_id_a, mime_id_b - ); - </sql> - <sql> - update tags_by_mime tbm set tags_i_b=( - select sum(tb.tags_i) as cnt from tags_b tb - join tags_a ta on tb.id=ta.id - join profiles_a pa on pa.id=ta.id - join profiles_b pb on pa.id=pb.id - where pb.mime_id= tbm.mime_id_b - and pa.mime_id=tbm.mime_id_a - and ta.tags_parse_exception=false - and tb.tags_parse_exception=false - group by mime_id_a, mime_id_b - ); - </sql> - <sql> - update tags_by_mime tbm set tags_img_b=( - select sum(tb.tags_img) as cnt from tags_b tb - join tags_a ta on tb.id=ta.id - join profiles_a pa on pa.id=ta.id - join profiles_b pb on pa.id=pb.id - where pb.mime_id= tbm.mime_id_b - and pa.mime_id=tbm.mime_id_a - and ta.tags_parse_exception=false - and tb.tags_parse_exception=false - group by mime_id_a, mime_id_b - ); - </sql> - <sql> - update tags_by_mime tbm set tags_li_b=( - select sum(tb.tags_li) as cnt from tags_b tb - join tags_a ta on ta.id=tb.id - join profiles_a pa on pa.id=ta.id - join profiles_b pb on pa.id=pb.id - where pb.mime_id= tbm.mime_id_b - and pa.mime_id=tbm.mime_id_a - and ta.tags_parse_exception=false - and tb.tags_parse_exception=false - group by mime_id_a, mime_id_b - ); - </sql> - <sql> - update tags_by_mime tbm set tags_ol_b=( - select sum(tb.tags_ol) as cnt from tags_b tb - join tags_a ta on tb.id=ta.id - join profiles_a pa on pa.id=ta.id - join profiles_b pb on pa.id=pb.id - where pb.mime_id= tbm.mime_id_b - and pa.mime_id=tbm.mime_id_a - and ta.tags_parse_exception=false - and tb.tags_parse_exception=false - group by mime_id_a, mime_id_b - ); - </sql> - <sql> - update tags_by_mime tbm set tags_p_b=( - select sum(tb.tags_p) as cnt from tags_b tb - join tags_a ta on tb.id=ta.id - join profiles_a pa on pa.id=ta.id - join profiles_b pb on pa.id=pb.id - where pb.mime_id= tbm.mime_id_b - and pa.mime_id=tbm.mime_id_a - and ta.tags_parse_exception=false - and tb.tags_parse_exception=false - group by mime_id_a, mime_id_b - ); - </sql> - <sql> - update tags_by_mime tbm set tags_table_b=( - select sum(tb.tags_table) as cnt from tags_b tb - join tags_a ta on tb.id=ta.id - join profiles_a pa on pa.id=ta.id - join profiles_b pb on pa.id=pb.id - where pb.mime_id= tbm.mime_id_b - and pa.mime_id=tbm.mime_id_a - and ta.tags_parse_exception=false - and tb.tags_parse_exception=false - group by mime_id_a, mime_id_b - ); - </sql> - <sql> - update tags_by_mime tbm set tags_td_b=( - select sum(tb.tags_td) as cnt from tags_b tb - join tags_a ta on tb.id=ta.id - join profiles_a pa on pa.id=ta.id - join profiles_b pb on pa.id=pb.id - where pb.mime_id= tbm.mime_id_b - and pa.mime_id=tbm.mime_id_a - and ta.tags_parse_exception=false - and tb.tags_parse_exception=false - group by mime_id_a, mime_id_b - ); - </sql> - <sql> - update tags_by_mime tbm set tags_title_b=( - select sum(tb.tags_title) as cnt from tags_b tb - join tags_a ta on tb.id=ta.id - join profiles_a pa on pa.id=ta.id - join profiles_b pb on pa.id=pb.id - where pb.mime_id= tbm.mime_id_b - and pa.mime_id=tbm.mime_id_a - and ta.tags_parse_exception=false - and tb.tags_parse_exception=false - group by mime_id_a, mime_id_b - ); - </sql> - <sql> - update tags_by_mime tbm set tags_tr_b=( - select sum(tb.tags_tr) as cnt from tags_b tb - join tags_a ta on tb.id=ta.id - join profiles_a pa on pa.id=ta.id - join profiles_b pb on pa.id=pb.id - where pb.mime_id= tbm.mime_id_b - and pa.mime_id=tbm.mime_id_a - and ta.tags_parse_exception=false - and tb.tags_parse_exception=false - group by mime_id_a, mime_id_b - ); - </sql> - <sql> - update tags_by_mime tbm set tags_u_b=( - select sum(tb.tags_u) as cnt from tags_b tb - join tags_a ta on tb.id=ta.id - join profiles_a pa on pa.id=ta.id - join profiles_b pb on pa.id=pb.id - where pb.mime_id= tbm.mime_id_b - and pa.mime_id=tbm.mime_id_a - and ta.tags_parse_exception=false - and tb.tags_parse_exception=false - group by mime_id_a, mime_id_b - ); - </sql> - <sql> - update tags_by_mime tbm set tags_ul_b=( - select sum(tb.tags_ul) as cnt from tags_b tb - join tags_a ta on ta.id=tb.id - join profiles_a pa on pa.id=ta.id - join profiles_b pb on pa.id=pb.id - where pb.mime_id= tbm.mime_id_b - and pa.mime_id=tbm.mime_id_a - and ta.tags_parse_exception=false - and tb.tags_parse_exception=false - group by mime_id_a, mime_id_b - ); - </sql> - <sql>drop table if exists tag_exceptions_by_mime</sql> - <sql>create table tag_exceptions_by_mime ( - mime_id_a integer, - mime_id_b integer, - tag_exceptions_a integer, - tag_exceptions_b integer) - </sql> - <sql> - insert into tag_exceptions_by_mime (mime_id_a, mime_id_b, - tag_exceptions_a, tag_exceptions_b) - select ma.mime_id, mb.mime_id,0,0 - from profiles_a a - join profiles_b b on a.id=b.id - join mimes ma on ma.mime_id=a.mime_id - join mimes mb on mb.mime_id=b.mime_id - group by ma.mime_id, mb.mime_id - </sql> - <sql> - update tag_exceptions_by_mime tebm set tag_exceptions_a=( - select count(1) as cnt from tags_a ta - join profiles_a pa on pa.id=ta.id - join profiles_b pb on pa.id=pb.id - where pb.mime_id= tebm.mime_id_b - and pa.mime_id=tebm.mime_id_a - and ta.tags_parse_exception=true - group by mime_id_a, mime_id_b - ); - </sql> - <sql> - update tag_exceptions_by_mime tebm set tag_exceptions_b=( - select count(1) as cnt from tags_b tb - join profiles_a pa on pa.id=tb.id - join profiles_b pb on pa.id=pb.id - where pb.mime_id= tebm.mime_id_b - and pa.mime_id=tebm.mime_id_a - and tb.tags_parse_exception=true - group by mime_id_a, mime_id_b - ); - </sql> <sql> drop table if exists parse_time_compared; </sql> @@ -705,7 +257,7 @@ </sql> <sql> update parse_time_compared ptc set prcnt_increase=(100.0 * - cast(total_b as decimal)/cast(total_a as decimal)) + cast(total_b as double)/cast(total_a as double)) where total_a > 0; </sql> </before> @@ -1202,6 +754,35 @@ on e.extract_exception_id=t.extract_exception_id </sql> </report> + <report reportName="fixedCatastrophicExtractExceptions" + reportFilename="exceptions/fixed_catastrophic_exceptions_in_b.xlsx" + format="xlsx" + includeSql="true"> + <sql> + select exa.file_path, ra.extract_exception_description, rb.extract_exception_description + from extract_exceptions_a exa + left join extract_exceptions_b exb on exa.container_id=exb.container_id + join ref_extract_exception_types ra on exa.extract_exception_id = ra.extract_exception_id + left join ref_extract_exception_types rb on exb.extract_exception_id = rb.extract_exception_id + where exa.extract_exception_id < 4 + and (exb.extract_exception_id is null or exb.extract_exception_id > 3) + </sql> + </report> + <report reportName="newCatastrophicExtractExceptions" + reportFilename="exceptions/new_catastrophic_exceptions_in_b.xlsx" + format="xlsx" + includeSql="true"> + <sql> + select exb.file_path, rb.extract_exception_description, ra.extract_exception_description + from extract_exceptions_b exb + left join extract_exceptions_a exa on exb.container_id=exa.container_id + join ref_extract_exception_types rb on exb.extract_exception_id = rb.extract_exception_id + left join ref_extract_exception_types ra on exa.extract_exception_id = ra.extract_exception_id + where exb.extract_exception_id < 4 + and (exa.extract_exception_id is null or exa.extract_exception_id > 3) + </sql> + </report> + <report reportName="parseExceptionTypesA" reportFilename="exceptions/overall_exception_types_a.xlsx" format="xlsx" @@ -1227,6 +808,51 @@ </sql> </report> + <report reportName="DiceQuintiles" + reportFilename="content/dice_quintiles.xlsx" + format="xlsx" + includeSql="true"> + <sql> + SELECT + case + when dice_coefficient < 0.20 then '0<0.20%' + when dice_coefficient < 0.40 then '20%<40%' + when dice_coefficient < 0.60 then '40%<60%' + when dice_coefficient < 0.80 then '60%<80%' + when dice_coefficient < 1.01 then '80%-100%' + else 'other' + end as range, + COUNT(*) AS COUNT, + FROM content_comparisons cc + join contents_a ca on cc.id = ca.id + join contents_b cb on cc.id = cb.id + where ca.num_tokens > 10 or cb.num_tokens > 10 + GROUP BY range + </sql> + </report> + + <report reportName="OverlapQuintiles" + reportFilename="content/overlap_quintiles.xlsx" + format="xlsx" + includeSql="true"> + <sql> + SELECT + case + when overlap < 0.20 then '0<0.20%' + when overlap < 0.40 then '20%<40%' + when overlap < 0.60 then '40%<60%' + when overlap < 0.80 then '60%<80%' + when overlap < 1.01 then '80%-100%' + else 'other' + end as range, + COUNT(*) AS COUNT, + FROM content_comparisons cc + join contents_a ca on cc.id = ca.id + join contents_b cb on cc.id = cb.id + where ca.num_tokens > 10 or cb.num_tokens > 10 + GROUP BY range + </sql> + </report> <report reportName="contentDiffsWExceptions" reportFilename="content/content_diffs_with_exceptions.xlsx" format="xlsx" @@ -1532,17 +1158,6 @@ order by cnt desc </sql> </report> - <report reportName="Container files missing in B details" - reportFilename="attachments/container_files_missing_in_B_details.xlsx" - format="xlsx" - includeSql="true"> - <sql> - select pa.file_name from profiles_a pa - left join profiles_b pb on pa.id=pb.id - where pb.id is null - and pa.is_embedded = false - </sql> - </report> <report reportName="Embedded files missing in B by Mime" reportFilename="attachments/embedded_files_missing_in_B_by_mime.xlsx" format="xlsx" @@ -1633,94 +1248,6 @@ limit 100000 </sql> </report> - <report reportName="Tag Count Diffs By Mime" - reportFilename="tags/tag_count_diffs_by_mime.xlsx" - format="xlsx" - includeSql="true"> - <sql> - select ma.mime_string as mime_string_a, - mb.mime_string as mime_string_b, - tags_a_a, - tags_a_b, - tags_b_a, - tags_b_b, - tags_div_a, - tags_div_b, - tags_i_a, - tags_i_b, - tags_li_a, - tags_li_b, - tags_ol_a, - tags_ol_b, - tags_p_a, - tags_p_b, - tags_table_a, - tags_table_b, - tags_td_a, - tags_td_b, - tags_title_a, - tags_title_b, - tags_tr_a, - tags_tr_b, - tags_u_a, - tags_u_b, - tags_ul_a, - tags_ul_b - from - tags_by_mime tbm - join mimes ma on tbm.mime_id_a=ma.mime_id - join mimes mb on tbm.mime_id_b=mb.mime_id - limit 100000 - </sql> - - </report> - <report reportName="Tag Exceptions By Mime" - reportFilename="tags/tag_exceptions_by_mime.xlsx" - format="xlsx" - includeSql="true"> - <sql> - select ma.mime_string as mime_string_a, - mb.mime_string as mime_string_b, - tag_exceptions_a, - tag_exceptions_b, - (tag_exceptions_b-tag_exceptions_a) as diff_tag_exceptions_in_b - from tag_exceptions_by_mime tebm - join mimes ma on tebm.mime_id_a=ma.mime_id - join mimes mb on tebm.mime_id_b=mb.mime_id - order by diff_tag_exceptions_in_b desc - </sql> - </report> - <report reportName="Tag Exceptions Details A" - reportFilename="tags/tag_exceptions_details_a.xlsx" - format="xlsx" - includeSql="true"> - <sql> - select c.file_path,pa.file_name,mime_string,is_embedded from - tags_a ta - join profiles_a pa on ta.id=pa.id - join containers c on pa.container_id=c.container_id - join mimes m on pa.mime_id=m.mime_id - where ta.tags_parse_exception=true - order by m.mime_string - limit 20000 - </sql> - </report> - <report reportName="Tag Exceptions Details B" - reportFilename="tags/tag_exceptions_details_b.xlsx" - format="xlsx" - includeSql="true"> - <sql> - select c.file_path,pb.file_name,mime_string,is_embedded from - tags_b tb - join profiles_b pb on tb.id=pb.id - join containers c on pb.container_id=c.container_id - join mimes m on pb.mime_id=m.mime_id - where tb.tags_parse_exception=true - order by m.mime_string - limit 20000 - </sql> - </report> - <report reportName="Parse Time (Millis) Compared" reportFilename="parse_times/parse_time_millis_by_mime_compared.xlsx" format="xlsx" diff --git a/tika-eval/src/main/resources/db.properties b/tika-eval/src/main/resources/db.properties index 548a3c4..a35e35a 100644 --- a/tika-eval/src/main/resources/db.properties +++ b/tika-eval/src/main/resources/db.properties @@ -1,3 +1,4 @@ +# # Licensed to the Apache Software Foundation (ASF) under one or more # contributor license agreements. See the NOTICE file distributed with # this work for additional information regarding copyright ownership. diff --git a/tika-eval/src/main/resources/profile-reports.xml b/tika-eval/src/main/resources/profile-reports.xml index 028a7f4..db74481 100644 --- a/tika-eval/src/main/resources/profile-reports.xml +++ b/tika-eval/src/main/resources/profile-reports.xml @@ -122,7 +122,7 @@ num_tokens, num_alphabetic_tokens, num_common_tokens, case when num_alphabetic_tokens > 0 - then cast(num_common_tokens as decimal)/cast(num_alphabetic_tokens as decimal) + then cast(num_common_tokens as double)/cast(num_alphabetic_tokens as double) else 0 end as common_div_alphabetic from contents c @@ -131,7 +131,7 @@ join mimes m on p.mime_id=m.mime_id where (num_alphabetic_tokens = 0 - or cast(num_common_tokens as decimal)/cast(num_alphabetic_tokens as decimal) < 0.50 + or cast(num_common_tokens as double)/cast(num_alphabetic_tokens as double) < 0.50 ) and mime_string not like 'image%' and mime_string not like 'video%' @@ -159,7 +159,7 @@ when num_tokens = 0 then 0 else - cast(num_tokens as decimal)/cast(num_pages as decimal) + cast(num_tokens as double)/cast(num_pages as double) end as num_tokens_div_num_pages from profiles p left join contents c on p.id=c.id @@ -260,67 +260,6 @@ CONTAINER_LENGTH asc </sql> </report> - <report reportName="TagExceptionsByMime" - reportFilename="tags/tag_exceptions_by_mime.xlsx" - format="xlsx" - includeSql="true"> - - <sql> - select mime_string, count(1) as CNT - from tags t - join profiles p on p.id=t.id - join mimes m on p.mime_id=m.mime_id - where tags_parse_exception=TRUE - group by mime_string - order by CNT desc - </sql> - </report> - <report reportName="Tag Exceptions Details" - reportFilename="tags/tag_exceptions_details.xlsx" - format="xlsx" - includeSql="true"> - - <sql> - select c.file_path,p.file_name,mime_string,is_embedded from - tags t - join profiles p on t.id=p.id - join containers c on p.container_id=c.container_id - join mimes m on p.mime_id=m.mime_id - where t.tags_parse_exception=true - order by m.mime_string - limit 20000 - </sql> - </report> - <report reportName="Tags by Mime" - reportFilename="tags/tags_by_mime.xlsx" - format="xlsx" - includeSql="true"> - - <sql> - select mime_string, - sum(tags_a) as tags_a, - sum(tags_b) as tags_b, - sum(tags_div) as tags_div, - sum(tags_i) as tags_i, - sum(tags_img) as tags_img, - sum(tags_li) as tags_li, - sum(tags_ol) as tags_ol, - sum(tags_p) as tags_p, - sum(tags_table) as tags_table, - sum(tags_td) as tags_td, - sum(tags_title) as tags_title, - sum(tags_tr) as tags_tr, - sum(tags_u) as tags_u, - sum(tags_ul) as tags_ul - - from tags t - join profiles p on t.id=p.id - join mimes m on p.mime_id=m.mime_id - where tags_parse_exception=false - group by m.mime_id - </sql> - - </report> <after> <!--<sql>drop index on x</sql> diff --git a/tika-parent/pom.xml b/tika-parent/pom.xml index d375b93..4045278 100644 --- a/tika-parent/pom.xml +++ b/tika-parent/pom.xml @@ -282,7 +282,7 @@ <osgi.core.version>6.0.0</osgi.core.version> <cxf.version>3.4.5</cxf.version> - <slf4j.version>1.7.30</slf4j.version> + <slf4j.version>1.7.32</slf4j.version> <log4j2.version>2.16.0</log4j2.version> <jackson.version>2.13.0</jackson.version> <!-- when this is next upgraded, see if we can get rid of @@ -291,7 +291,7 @@ See TIKA-3407 --> <jaxb.version>2.3.5</jaxb.version> <cli.version>1.5.0</cli.version> - <lucene.version>8.9.0</lucene.version> + <lucene.version>8.11.0</lucene.version> <mockito.version>3.11.2</mockito.version> <lombok.version>1.18.20</lombok.version> <opennlp.version>1.9.4</opennlp.version>
