Repository: tika
Updated Branches:
  refs/heads/master aa7a0c353 -> 506b57256


TIKA-1332 -- fix one report for eval profiler and clean up whitespace


Project: http://git-wip-us.apache.org/repos/asf/tika/repo
Commit: http://git-wip-us.apache.org/repos/asf/tika/commit/506b5725
Tree: http://git-wip-us.apache.org/repos/asf/tika/tree/506b5725
Diff: http://git-wip-us.apache.org/repos/asf/tika/diff/506b5725

Branch: refs/heads/master
Commit: 506b572560f6c7f44270b55877f110719a7d4b1f
Parents: aa7a0c3
Author: tballison <talli...@mitre.org>
Authored: Thu Feb 16 12:33:58 2017 -0500
Committer: tballison <talli...@mitre.org>
Committed: Thu Feb 16 12:33:58 2017 -0500

----------------------------------------------------------------------
 .../src/main/resources/comparison-reports.xml   |  2 +-
 .../src/main/resources/lucene-analyzers.json    | 30 +++------
 .../src/main/resources/profile-reports.xml      | 11 ++--
 .../resources/tika-eval-comparison-config.xml   | 65 ++++++++++----------
 ...ingle-file-profiler-crawl-extract-config.xml |  2 +-
 .../single-file-profiler-crawl-input-config.xml |  2 +-
 6 files changed, 52 insertions(+), 60 deletions(-)
----------------------------------------------------------------------


http://git-wip-us.apache.org/repos/asf/tika/blob/506b5725/tika-eval/src/main/resources/comparison-reports.xml
----------------------------------------------------------------------
diff --git a/tika-eval/src/main/resources/comparison-reports.xml b/tika-eval/src/main/resources/comparison-reports.xml
index cb7befd..d69cb2a 100644
--- a/tika-eval/src/main/resources/comparison-reports.xml
+++ b/tika-eval/src/main/resources/comparison-reports.xml
@@ -206,7 +206,7 @@
         </sql>
     </report>
 
-       <report reportName="Mime Differences A -> B Details"
+    <report reportName="Mime Differences A -> B Details"
             reportFilename="mimes/mime_diffs_A_to_B_details.xlsx"
             format="xlsx"
             includeSql="true">

http://git-wip-us.apache.org/repos/asf/tika/blob/506b5725/tika-eval/src/main/resources/lucene-analyzers.json
----------------------------------------------------------------------
diff --git a/tika-eval/src/main/resources/lucene-analyzers.json b/tika-eval/src/main/resources/lucene-analyzers.json
index 268494f..f7141f7 100644
--- a/tika-eval/src/main/resources/lucene-analyzers.json
+++ b/tika-eval/src/main/resources/lucene-analyzers.json
@@ -1,12 +1,11 @@
 {
   "analyzers": {
-    "general" :
-    {
+    "general": {
       "charfilters": [
         {
           "factory": "oala.charfilter.MappingCharFilterFactory",
           "params": {
-            "mapping" : "/lucene-char-mapping.txt"
+            "mapping": "/lucene-char-mapping.txt"
           }
         }
       ],
@@ -22,20 +21,17 @@
         {
           "factory": "oala.cjk.CJKBigramFilterFactory",
           "params": {
-            "outputUnigrams" : "false"
+            "outputUnigrams": "false"
           }
         }
       ]
-
     },
-
-    "alpha" :
-    {
+    "alpha": {
       "charfilters": [
         {
           "factory": "oala.charfilter.MappingCharFilterFactory",
           "params": {
-            "mapping" : "/lucene-char-mapping.txt"
+            "mapping": "/lucene-char-mapping.txt"
           }
         }
       ],
@@ -67,7 +63,7 @@
         {
           "factory": "oala.cjk.CJKBigramFilterFactory",
           "params": {
-            "outputUnigrams" : "false"
+            "outputUnigrams": "false"
           }
         },
         {
@@ -75,33 +71,27 @@
           "params": {}
         }
       ]
-
     },
-    "common_tokens" :
-    {
+    "common_tokens": {
       "tokenizer": {
         "factory": "oala.standard.UAX29URLEmailTokenizerFactory",
         "params": {}
       },
-
       "tokenfilters": [
         {
           "factory": "oala.cjk.CJKBigramFilterFactory",
           "params": {
-            "outputUnigrams" : "false"
+            "outputUnigrams": "false"
           }
         },
         {
           "factory": "org.apache.tika.eval.tokens.CJKBigramAwareLengthFilterFactory",
           "params": {
-            "min" : 4,
-            "max" : 20
+            "min": 4,
+            "max": 20
           }
         }
-
       ]
-
     }
-
   }
 }
\ No newline at end of file

http://git-wip-us.apache.org/repos/asf/tika/blob/506b5725/tika-eval/src/main/resources/profile-reports.xml
----------------------------------------------------------------------
diff --git a/tika-eval/src/main/resources/profile-reports.xml b/tika-eval/src/main/resources/profile-reports.xml
index 2a94a97..1f9be6a 100644
--- a/tika-eval/src/main/resources/profile-reports.xml
+++ b/tika-eval/src/main/resources/profile-reports.xml
@@ -98,7 +98,6 @@
     </report>
 
 
-
     <report reportName="Token Count by Detected Language"
             reportFilename="content/num_tokens_by_detected_langs.xlsx"
             format="xlsx"
@@ -117,10 +116,12 @@
             includeSql="true">
 
         <sql>
-            select LANG_ID_1 as DetectedLang, count(1) as cnt
-            from contents
-            group by LANG_ID_1
-            order by cnt desc
+            select parse_exception_description, count(1) cnt
+            from parse_exceptions e
+            join profiles p on p.id = e.id
+            join ref_parse_exception_types et on et.parse_exception_type_id=e.parse_exception_type_id
+            group by parse_exception_description
+            order by cnt desc;
         </sql>
     </report>
 

http://git-wip-us.apache.org/repos/asf/tika/blob/506b5725/tika-eval/src/main/resources/tika-eval-comparison-config.xml
----------------------------------------------------------------------
diff --git a/tika-eval/src/main/resources/tika-eval-comparison-config.xml b/tika-eval/src/main/resources/tika-eval-comparison-config.xml
index b29764e..04ef658 100644
--- a/tika-eval/src/main/resources/tika-eval-comparison-config.xml
+++ b/tika-eval/src/main/resources/tika-eval-comparison-config.xml
@@ -20,34 +20,34 @@
 
 <tika-batch-config
         maxAliveTimeSeconds="-1"
-        pauseOnEarlyTerminationMillis = "500"
+        pauseOnEarlyTerminationMillis="500"
         timeoutCheckPulseMillis="1000"
         maxQueueSize="10000"
         numConsumers="5"
         timeoutThresholdMillis="300000"
-        >
+>
 
     <commandline>
-    <option opt="c" longOpt="tika-config" hasArg="true"
-            description="TikaConfig file"/>
-    <option opt="bc" longOpt="batch-config" hasArg="true"
-            description="xml batch config file" required="true"/>
-    <option opt="inputDir" hasArg="true"
-            description="dir to start crawling"/>
-    <option opt="numConsumers" hasArg="true"
-            description="number of fileConsumers threads"/>
-    <option opt="extractsA" hasArg="true"
-            description="this dir for analysis" required="false"/>
-    <option opt="extractsB" hasArg="true"
-            description="thatDir for analysis"/>
-    <option opt="db" hasArg="true"
-            description="name of db directory or file to which to write results"/>
-    <option opt="alterExtract" hasArg="true"
+        <option opt="c" longOpt="tika-config" hasArg="true"
+                description="TikaConfig file"/>
+        <option opt="bc" longOpt="batch-config" hasArg="true"
+                description="xml batch config file" required="true"/>
+        <option opt="inputDir" hasArg="true"
+                description="dir to start crawling"/>
+        <option opt="numConsumers" hasArg="true"
+                description="number of fileConsumers threads"/>
+        <option opt="extractsA" hasArg="true"
+                description="this dir for analysis" required="false"/>
+        <option opt="extractsB" hasArg="true"
+                description="thatDir for analysis"/>
+        <option opt="db" hasArg="true"
+                description="name of db directory or file to which to write results"/>
+        <option opt="alterExtract" hasArg="true"
                 description="for json-formatted extract files
                 process full metadata list ('as_is'=default),
                 take just the first/container document ('first_only'),
                 concatenate all content into the first metadata item ('concatenate_content')"/>
-    <option opt="includeFilePat" hasArg="true"
+        <option opt="includeFilePat" hasArg="true"
                 description="regex for files to include"/>
     </commandline>
 
@@ -56,15 +56,15 @@
         Can also add startDir: this tells the crawler to start indexing a
         child directory of the srcDir directory.
     -->
-       <crawler builderClass="org.apache.tika.batch.fs.builders.FSCrawlerBuilder"
-               crawlOrder="sorted"
-        maxConsecWaitMillis="30000"
-               maxFilesToAdd="-1" 
-               maxFilesToConsider="-1" 
-               includeFilePat=""
-               excludeFilePat=""
-               maxFileSizeBytes="10000000"
-        />
+    <crawler builderClass="org.apache.tika.batch.fs.builders.FSCrawlerBuilder"
+             crawlOrder="sorted"
+             maxConsecWaitMillis="30000"
+             maxFilesToAdd="-1"
+             maxFilesToConsider="-1"
+             includeFilePat=""
+             excludeFilePat=""
+             maxFileSizeBytes="10000000"
+    />
 
     <consumers builderClass="org.apache.tika.eval.batch.EvalConsumersBuilder"
                consumerBuilderClass="org.apache.tika.eval.batch.FileComparerBuilder"
@@ -73,11 +73,12 @@
                minJsonFileSizeBytes="-1"
                maxJsonFileSizeBytes="2000000"
                commonTokens="resources/commontokens"
-            />
+    />
 
-<!--               langModelDir="resources/langmodels" -->
+    <!--               langModelDir="resources/langmodels" -->
 
-       <!-- reporter and interrupter are optional -->
-       <reporter builderClass="org.apache.tika.batch.builders.SimpleLogReporterBuilder" sleepMillis="1000" staleThresholdMillis="500000"/>
-       <interrupter builderClass="org.apache.tika.batch.builders.InterrupterBuilder"/>
+    <!-- reporter and interrupter are optional -->
+    <reporter builderClass="org.apache.tika.batch.builders.SimpleLogReporterBuilder" sleepMillis="1000"
+              staleThresholdMillis="500000"/>
+    <interrupter builderClass="org.apache.tika.batch.builders.InterrupterBuilder"/>
 </tika-batch-config>

http://git-wip-us.apache.org/repos/asf/tika/blob/506b5725/tika-eval/src/test/resources/single-file-profiler-crawl-extract-config.xml
----------------------------------------------------------------------
diff --git a/tika-eval/src/test/resources/single-file-profiler-crawl-extract-config.xml b/tika-eval/src/test/resources/single-file-profiler-crawl-extract-config.xml
index e8b9d6c..536f39c 100644
--- a/tika-eval/src/test/resources/single-file-profiler-crawl-extract-config.xml
+++ b/tika-eval/src/test/resources/single-file-profiler-crawl-extract-config.xml
@@ -56,7 +56,7 @@
              includeFilePat=""
              excludeFilePat=""
              maxFileSizeBytes="-1"
-            />
+    />
 
     <consumers builderClass="org.apache.tika.eval.batch.EvalConsumersBuilder"
                consumerBuilderClass="org.apache.tika.eval.batch.SingleFileConsumerBuilder"

http://git-wip-us.apache.org/repos/asf/tika/blob/506b5725/tika-eval/src/test/resources/single-file-profiler-crawl-input-config.xml
----------------------------------------------------------------------
diff --git a/tika-eval/src/test/resources/single-file-profiler-crawl-input-config.xml b/tika-eval/src/test/resources/single-file-profiler-crawl-input-config.xml
index da59d03..29b7f3b 100644
--- a/tika-eval/src/test/resources/single-file-profiler-crawl-input-config.xml
+++ b/tika-eval/src/test/resources/single-file-profiler-crawl-input-config.xml
@@ -56,7 +56,7 @@
              includeFilePat=""
              excludeFilePat=""
              maxFileSizeBytes="-1"
-            />
+    />
 
     <consumers builderClass="org.apache.tika.eval.batch.EvalConsumersBuilder"
                consumerBuilderClass="org.apache.tika.eval.batch.SingleFileConsumerBuilder"

Reply via email to