Repository: tika Updated Branches: refs/heads/master aa7a0c353 -> 506b57256
TIKA-1332 -- fix one report for eval profiler and clean up whitespace Project: http://git-wip-us.apache.org/repos/asf/tika/repo Commit: http://git-wip-us.apache.org/repos/asf/tika/commit/506b5725 Tree: http://git-wip-us.apache.org/repos/asf/tika/tree/506b5725 Diff: http://git-wip-us.apache.org/repos/asf/tika/diff/506b5725 Branch: refs/heads/master Commit: 506b572560f6c7f44270b55877f110719a7d4b1f Parents: aa7a0c3 Author: tballison <talli...@mitre.org> Authored: Thu Feb 16 12:33:58 2017 -0500 Committer: tballison <talli...@mitre.org> Committed: Thu Feb 16 12:33:58 2017 -0500 ---------------------------------------------------------------------- .../src/main/resources/comparison-reports.xml | 2 +- .../src/main/resources/lucene-analyzers.json | 30 +++------ .../src/main/resources/profile-reports.xml | 11 ++-- .../resources/tika-eval-comparison-config.xml | 65 ++++++++++---------- ...ingle-file-profiler-crawl-extract-config.xml | 2 +- .../single-file-profiler-crawl-input-config.xml | 2 +- 6 files changed, 52 insertions(+), 60 deletions(-) ---------------------------------------------------------------------- http://git-wip-us.apache.org/repos/asf/tika/blob/506b5725/tika-eval/src/main/resources/comparison-reports.xml ---------------------------------------------------------------------- diff --git a/tika-eval/src/main/resources/comparison-reports.xml b/tika-eval/src/main/resources/comparison-reports.xml index cb7befd..d69cb2a 100644 --- a/tika-eval/src/main/resources/comparison-reports.xml +++ b/tika-eval/src/main/resources/comparison-reports.xml @@ -206,7 +206,7 @@ </sql> </report> - <report reportName="Mime Differences A -> B Details" + <report reportName="Mime Differences A -> B Details" reportFilename="mimes/mime_diffs_A_to_B_details.xlsx" format="xlsx" includeSql="true"> http://git-wip-us.apache.org/repos/asf/tika/blob/506b5725/tika-eval/src/main/resources/lucene-analyzers.json ---------------------------------------------------------------------- diff --git a/tika-eval/src/main/resources/lucene-analyzers.json b/tika-eval/src/main/resources/lucene-analyzers.json index 268494f..f7141f7 100644 --- a/tika-eval/src/main/resources/lucene-analyzers.json +++ b/tika-eval/src/main/resources/lucene-analyzers.json @@ -1,12 +1,11 @@ { "analyzers": { - "general" : - { + "general": { "charfilters": [ { "factory": "oala.charfilter.MappingCharFilterFactory", "params": { - "mapping" : "/lucene-char-mapping.txt" + "mapping": "/lucene-char-mapping.txt" } } ], @@ -22,20 +21,17 @@ { "factory": "oala.cjk.CJKBigramFilterFactory", "params": { - "outputUnigrams" : "false" + "outputUnigrams": "false" } } ] - }, - - "alpha" : - { + "alpha": { "charfilters": [ { "factory": "oala.charfilter.MappingCharFilterFactory", "params": { - "mapping" : "/lucene-char-mapping.txt" + "mapping": "/lucene-char-mapping.txt" } } ], @@ -67,7 +63,7 @@ { "factory": "oala.cjk.CJKBigramFilterFactory", "params": { - "outputUnigrams" : "false" + "outputUnigrams": "false" } }, { @@ -75,33 +71,27 @@ "params": {} } ] - }, - "common_tokens" : - { + "common_tokens": { "tokenizer": { "factory": "oala.standard.UAX29URLEmailTokenizerFactory", "params": {} }, - "tokenfilters": [ { "factory": "oala.cjk.CJKBigramFilterFactory", "params": { - "outputUnigrams" : "false" + "outputUnigrams": "false" } }, { "factory": "org.apache.tika.eval.tokens.CJKBigramAwareLengthFilterFactory", "params": { - "min" : 4, - "max" : 20 + "min": 4, + "max": 20 } } - ] - } - } } \ No newline at end of file http://git-wip-us.apache.org/repos/asf/tika/blob/506b5725/tika-eval/src/main/resources/profile-reports.xml ---------------------------------------------------------------------- diff --git a/tika-eval/src/main/resources/profile-reports.xml b/tika-eval/src/main/resources/profile-reports.xml index 2a94a97..1f9be6a 100644 --- a/tika-eval/src/main/resources/profile-reports.xml +++ b/tika-eval/src/main/resources/profile-reports.xml @@ -98,7 +98,6 @@ </report> - <report reportName="Token Count by Detected Language" reportFilename="content/num_tokens_by_detected_langs.xlsx" format="xlsx" @@ -117,10 +116,12 @@ includeSql="true"> <sql> - select LANG_ID_1 as DetectedLang, count(1) as cnt - from contents - group by LANG_ID_1 - order by cnt desc + select parse_exception_description, count(1) cnt + from parse_exceptions e + join profiles p on p.id = e.id + join ref_parse_exception_types et on et.parse_exception_type_id=e.parse_exception_type_id + group by parse_exception_description + order by cnt desc; </sql> </report> http://git-wip-us.apache.org/repos/asf/tika/blob/506b5725/tika-eval/src/main/resources/tika-eval-comparison-config.xml ---------------------------------------------------------------------- diff --git a/tika-eval/src/main/resources/tika-eval-comparison-config.xml b/tika-eval/src/main/resources/tika-eval-comparison-config.xml index b29764e..04ef658 100644 --- a/tika-eval/src/main/resources/tika-eval-comparison-config.xml +++ b/tika-eval/src/main/resources/tika-eval-comparison-config.xml @@ -20,34 +20,34 @@ <tika-batch-config maxAliveTimeSeconds="-1" - pauseOnEarlyTerminationMillis = "500" + pauseOnEarlyTerminationMillis="500" timeoutCheckPulseMillis="1000" maxQueueSize="10000" numConsumers="5" timeoutThresholdMillis="300000" - > +> <commandline> - <option opt="c" longOpt="tika-config" hasArg="true" - description="TikaConfig file"/> - <option opt="bc" longOpt="batch-config" hasArg="true" - description="xml batch config file" required="true"/> - <option opt="inputDir" hasArg="true" - description="dir to start crawling"/> - <option opt="numConsumers" hasArg="true" - description="number of fileConsumers threads"/> - <option opt="extractsA" hasArg="true" - description="this dir for analysis" required="false"/> - <option opt="extractsB" hasArg="true" - description="thatDir for analysis"/> - <option opt="db" hasArg="true" - description="name of db directory or file to which to write results"/> - <option opt="alterExtract" hasArg="true" + <option opt="c" longOpt="tika-config" hasArg="true" + description="TikaConfig file"/> + <option opt="bc" longOpt="batch-config" hasArg="true" + description="xml batch config file" required="true"/> + <option opt="inputDir" hasArg="true" + description="dir to start crawling"/> + <option opt="numConsumers" hasArg="true" + description="number of fileConsumers threads"/> + <option opt="extractsA" hasArg="true" + description="this dir for analysis" required="false"/> + <option opt="extractsB" hasArg="true" + description="thatDir for analysis"/> + <option opt="db" hasArg="true" + description="name of db directory or file to which to write results"/> + <option opt="alterExtract" hasArg="true" description="for json-formatted extract files process full metadata list ('as_is'=default), take just the first/container document ('first_only'), concatenate all content into the first metadata item ('concatenate_content')"/> - <option opt="includeFilePat" hasArg="true" + <option opt="includeFilePat" hasArg="true" description="regex for files to include"/> </commandline> @@ -56,15 +56,15 @@ Can also add startDir: this tells the crawler to start indexing a child directory of the srcDir directory. --> - <crawler builderClass="org.apache.tika.batch.fs.builders.FSCrawlerBuilder" - crawlOrder="sorted" - maxConsecWaitMillis="30000" - maxFilesToAdd="-1" - maxFilesToConsider="-1" - includeFilePat="" - excludeFilePat="" - maxFileSizeBytes="10000000" - /> + <crawler builderClass="org.apache.tika.batch.fs.builders.FSCrawlerBuilder" + crawlOrder="sorted" + maxConsecWaitMillis="30000" + maxFilesToAdd="-1" + maxFilesToConsider="-1" + includeFilePat="" + excludeFilePat="" + maxFileSizeBytes="10000000" + /> <consumers builderClass="org.apache.tika.eval.batch.EvalConsumersBuilder" consumerBuilderClass="org.apache.tika.eval.batch.FileComparerBuilder" @@ -73,11 +73,12 @@ minJsonFileSizeBytes="-1" maxJsonFileSizeBytes="2000000" commonTokens="resources/commontokens" - /> + /> -<!-- langModelDir="resources/langmodels" --> + <!-- langModelDir="resources/langmodels" --> - <!-- reporter and interrupter are optional --> - <reporter builderClass="org.apache.tika.batch.builders.SimpleLogReporterBuilder" sleepMillis="1000" staleThresholdMillis="500000"/> - <interrupter builderClass="org.apache.tika.batch.builders.InterrupterBuilder"/> + <!-- reporter and interrupter are optional --> + <reporter builderClass="org.apache.tika.batch.builders.SimpleLogReporterBuilder" sleepMillis="1000" + staleThresholdMillis="500000"/> + <interrupter builderClass="org.apache.tika.batch.builders.InterrupterBuilder"/> </tika-batch-config> http://git-wip-us.apache.org/repos/asf/tika/blob/506b5725/tika-eval/src/test/resources/single-file-profiler-crawl-extract-config.xml ---------------------------------------------------------------------- diff --git a/tika-eval/src/test/resources/single-file-profiler-crawl-extract-config.xml b/tika-eval/src/test/resources/single-file-profiler-crawl-extract-config.xml index e8b9d6c..536f39c 100644 --- a/tika-eval/src/test/resources/single-file-profiler-crawl-extract-config.xml +++ b/tika-eval/src/test/resources/single-file-profiler-crawl-extract-config.xml @@ -56,7 +56,7 @@ includeFilePat="" excludeFilePat="" maxFileSizeBytes="-1" - /> + /> <consumers builderClass="org.apache.tika.eval.batch.EvalConsumersBuilder" consumerBuilderClass="org.apache.tika.eval.batch.SingleFileConsumerBuilder" http://git-wip-us.apache.org/repos/asf/tika/blob/506b5725/tika-eval/src/test/resources/single-file-profiler-crawl-input-config.xml ---------------------------------------------------------------------- diff --git a/tika-eval/src/test/resources/single-file-profiler-crawl-input-config.xml b/tika-eval/src/test/resources/single-file-profiler-crawl-input-config.xml index da59d03..29b7f3b 100644 --- a/tika-eval/src/test/resources/single-file-profiler-crawl-input-config.xml +++ b/tika-eval/src/test/resources/single-file-profiler-crawl-input-config.xml @@ -56,7 +56,7 @@ includeFilePat="" excludeFilePat="" maxFileSizeBytes="-1" - /> + /> <consumers builderClass="org.apache.tika.eval.batch.EvalConsumersBuilder" consumerBuilderClass="org.apache.tika.eval.batch.SingleFileConsumerBuilder"