This is an automated email from the ASF dual-hosted git repository.
tallison pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/tika.git
The following commit(s) were added to refs/heads/master by this push:
new e31c933 TIKA-2318 -- include container file length in reports
that mention file path, and add a report that compares page count.
new 2ab94fe Merge remote-tracking branch 'origin/master'
e31c933 is described below
commit e31c933c153c76fc1048dbb8ed73757b97639764
Author: tballison <[email protected]>
AuthorDate: Tue May 9 15:02:19 2017 -0400
TIKA-2318 -- include container file length in reports that mention file
path, and add a report that compares page count.
---
.../src/main/resources/comparison-reports.xml | 83 ++++++++++++++++------
1 file changed, 63 insertions(+), 20 deletions(-)
diff --git a/tika-eval/src/main/resources/comparison-reports.xml b/tika-eval/src/main/resources/comparison-reports.xml
index 512ac6a..c0e084e 100644
--- a/tika-eval/src/main/resources/comparison-reports.xml
+++ b/tika-eval/src/main/resources/comparison-reports.xml
@@ -332,7 +332,10 @@
<sql>
select concat(ma.mime_string, ' -> ', mb.mime_string) as
- MIME_A_TO_MIME_B, file_path, a.file_name
+ MIME_A_TO_MIME_B,
+ file_path,
+ c.length as CONTAINER_LENGTH,
+ a.file_name
from profiles_a a
join profiles_b b on a.id=b.id
join mimes ma on ma.mime_id=a.mime_id
@@ -451,7 +454,9 @@
includeSql="true">
<sql>
- select file_path as FILE_PATH, ca.NUM_TOKENS as NUM_TOKENS_A,
+ select file_path as FILE_PATH,
+ c.length as CONTAINER_LENGTH,
+ ca.NUM_TOKENS as NUM_TOKENS_A,
cb.NUM_TOKENS as NUM_TOKENS_B,
ca.NUM_UNIQUE_TOKENS as NUM_UNIQUE_TOKENS_A, cb.NUM_UNIQUE_TOKENS
as NUM_UNIQUE_TOKENS_B,
@@ -497,8 +502,11 @@
format="xlsx"
includeSql="true">
<sql>
- select mime_string as MIME_TYPE,
- file_path, pa.file_name, pa.is_embedded
+ select
+ file_path,
+ c.length as CONTAINER_LENGTH,
+ mime_string as MIME_TYPE,
+ pa.file_name, pa.is_embedded
from exceptions_a ea
left join exceptions_b eb on ea.id = eb.id
join profiles_a pa on pa.id=ea.id
@@ -516,7 +524,9 @@
includeSql="true">
<sql>
- select file_path, mime_string as MIME_TYPE,
+ select file_path,
+ c.length as CONTAINER_LENGTH,
+ mime_string as MIME_TYPE,
CONTENT_LENGTH,
NUM_TOKENS, NUM_UNIQUE_TOKENS,
TOP_N_TOKENS, LANG_ID_1,TOKEN_LENGTH_MEAN, TOKEN_LENGTH_STD_DEV
@@ -576,7 +586,9 @@
includeSql="true">
<sql>
- select file_path, MIME_STRING as MIME_TYPE, p.length,
+ select file_path,
+ c.length as CONTAINER_LENGTH,
+ mime_string as MIME_TYPE,
eb.orig_stack_trace, eb.sort_stack_trace
from exceptions_b eb
left join exceptions_a ea on ea.id = eb.id
@@ -612,7 +624,9 @@
includeSql="true">
<sql>
- select file_path, c.length as FILE_LENGTH, MIME_STRING as
MIME_TYPE,
+ select file_path,
+ c.length as CONTAINER_LENGTH,
+ mime_string as MIME_TYPE,
orig_stack_trace, sort_stack_trace
from exceptions_a e
join profiles_a p on p.id=e.id
@@ -620,7 +634,7 @@
join mimes m on m.mime_id=p.mime_id
and e.parse_exception_id=0
order by MIME_TYPE asc, sort_stack_trace, orig_stack_trace,
- FILE_LENGTH asc
+ CONTAINER_LENGTH asc
</sql>
</report>
<report reportName="AllStackTracesInB"
@@ -629,7 +643,9 @@
includeSql="true">
<sql>
- select file_path, c.length as FILE_LENGTH, MIME_STRING as
MIME_TYPE,
+ select file_path,
+ c.length as CONTAINER_LENGTH,
+ mime_string as MIME_TYPE,
orig_stack_trace, sort_stack_trace
from exceptions_b e
join profiles_b p on p.id=e.id
@@ -637,7 +653,7 @@
join mimes m on m.mime_id=p.mime_id
and e.parse_exception_id=0
order by MIME_TYPE asc, sort_stack_trace, orig_stack_trace,
- FILE_LENGTH asc
+ CONTAINER_LENGTH asc
</sql>
</report>
@@ -711,8 +727,9 @@
<sql>
select file_path,
- ma.mime_string as mime_string_a,
- mb.mime_string as mime_string_b,
+ c.length as CONTAINER_LENGTH,
+ ma.mime_string as MIME_STRING_A,
+ mb.mime_string as MIME_STRING_B,
ca.num_unique_tokens as NUM_UNIQUE_TOKENS_A,
cb.num_unique_tokens as NUM_UNIQUE_TOKENS_B,
ca.num_tokens as NUM_TOKENS_A,
@@ -756,8 +773,9 @@
<sql>
select file_path,
- ma.mime_string as mime_string_a,
- mb.mime_string as mime_string_b,
+ c.length as CONTAINER_LENGTH,
+ ma.mime_string as MIME_STRING_A,
+ mb.mime_string as MIME_STRING_B,
ca.NUM_UNIQUE_TOKENS as NUM_UNIQUE_TOKENS_A,
cb.NUM_UNIQUE_TOKENS as NUM_UNIQUE_TOKENS_B,
ca.NUM_TOKENS as NUM_TOKENS_A,
@@ -810,6 +828,30 @@
order by change_in_common_tokens_b desc
</sql>
</report>
+ <report reportName="PageCountDiffs"
+ reportFilename="content/page_count_diffs.xlsx"
+ format="xlsx"
+ includeSql="true">
+
+ <sql>
+ select file_path,
+ c.length as CONTAINER_LENGTH,
+ ma.mime_string as MIME_STRING_A,
+ mb.mime_string as MIME_STRING_B,
+ pa.num_pages as NUM_PAGES_A,
+ pb.num_pages as NUM_PAGES_B,
+ (pb.num_pages-pa.num_pages) as DIFF_NUM_PAGES_IN_B
+ from profiles_a pa
+ join profiles_b pb on pa.id = pb.id
+ join containers c on pa.container_id=c.container_id
+ join mimes ma on ma.mime_id=pa.mime_id
+ join mimes mb on mb.mime_id=pb.mime_id
+ where pa.num_pages is not null
+ and pb.num_pages is not null
+ and pa.num_pages <> pb.num_pages
+ order by DIFF_NUM_PAGES_IN_B asc;
+ </sql>
+ </report>
<report reportName="ExceptionComparisonsByMimeType"
@@ -892,12 +934,13 @@
<sql>
select file_path,
- ma.mime_string as mime_string_a,
- mb.mime_string as mime_string_b,
- pa.num_attachments as num_attachments_a,
- pb.num_attachments as num_attachments_b,
- ea.parse_exception_id as exception_id_a,
- eb.parse_exception_id as exception_id_b
+ c.length as CONTAINER_LENGTH,
+ ma.mime_string as MIME_STRING_A,
+ mb.mime_string as MIME_STRING_B,
+ pa.num_attachments as NUM_ATTACHMENTS_A,
+ pb.num_attachments as NUM_ATTACHMENTS_B,
+ ea.parse_exception_id as EXCEPTION_ID_A,
+ eb.parse_exception_id as EXCEPTION_ID_B
from profiles_a pa
join profiles_b pb on pa.id= pb.id
join containers c on pa.container_id=c.container_id
--
To stop receiving notification emails like this one, please contact
['"[email protected]" <[email protected]>'].