This is an automated email from the ASF dual-hosted git repository.

tallison pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/tika.git


The following commit(s) were added to refs/heads/master by this push:
     new 2bcc0a7  TIKA-2268 -- add report for common_tokens/alphabetic tokens
2bcc0a7 is described below

commit 2bcc0a7a791227fe50ee5707f3d519cf60160825
Author: tballison <talli...@mitre.org>
AuthorDate: Thu Aug 10 14:06:13 2017 -0400

    TIKA-2268 -- add report for common_tokens/alphabetic tokens
---
 tika-eval/src/main/resources/profile-reports.xml | 18 ++++++++++++++++++
 1 file changed, 18 insertions(+)

diff --git a/tika-eval/src/main/resources/profile-reports.xml b/tika-eval/src/main/resources/profile-reports.xml
index 87642fd..0a7bb4d 100644
--- a/tika-eval/src/main/resources/profile-reports.xml
+++ b/tika-eval/src/main/resources/profile-reports.xml
@@ -110,6 +110,24 @@
             order by cnt desc;
         </sql>
     </report>
+
+    <report reportName="Common Tokens Divided by Alphabetic Tokens"
+            reportFilename="content/common_tokens_div_alphabetic.xlsx"
+            format="xlsx"
+            includeSql="true">
+        <!-- 0.50 is a complete heuristic; nullif() makes the ratio NULL (row excluded) when num_alphabetic_tokens is 0, instead of dividing by zero -->
+        <sql>
+            select file_path, lang_id_1, common_tokens_lang,
+            num_tokens, num_alphabetic_tokens, num_common_tokens,
+            cast(num_common_tokens as decimal)/nullif(cast(num_alphabetic_tokens as decimal), 0) common_div_alphabetic
+            from contents c
+            join profiles p on p.id=c.id
+            join containers ct on ct.container_id=p.container_id
+            where cast(num_common_tokens as decimal)/nullif(cast(num_alphabetic_tokens as decimal), 0) &lt; 0.50
+            order by common_div_alphabetic asc
+        </sql>
+    </report>
+
     <report reportName="Exceptions by Type"
             reportFilename="exceptions/exceptions_by_type.xlsx"
             format="xlsx"

-- 
To stop receiving notification emails like this one, please contact
"commits@tika.apache.org" <commits@tika.apache.org>.

Reply via email to