TIKA-1332 -- initial commit for tika-eval module. More work remains.

Project: http://git-wip-us.apache.org/repos/asf/tika/repo
Commit: http://git-wip-us.apache.org/repos/asf/tika/commit/aa7a0c35
Tree: http://git-wip-us.apache.org/repos/asf/tika/tree/aa7a0c35
Diff: http://git-wip-us.apache.org/repos/asf/tika/diff/aa7a0c35

Branch: refs/heads/master
Commit: aa7a0c353362d56cb1b8e77297f0807626b0246c
Parents: b9befb4
Author: tballison <[email protected]>
Authored: Thu Feb 16 12:18:32 2017 -0500
Committer: tballison <[email protected]>
Committed: Thu Feb 16 12:18:32 2017 -0500

----------------------------------------------------------------------
 CHANGES.txt                                     |   2 +
 LICENSE.txt                                     |   8 +
 pom.xml                                         |   1 +
 tika-eval/pom.xml                               | 281 +++++++
 .../org/apache/tika/eval/AbstractProfiler.java  | 693 ++++++++++++++++
 .../org/apache/tika/eval/EvalFilePaths.java     | 108 +++
 .../org/apache/tika/eval/ExtractComparer.java   | 455 +++++++++++
 .../org/apache/tika/eval/ExtractProfiler.java   | 238 ++++++
 .../java/org/apache/tika/eval/TikaEvalCLI.java  | 262 ++++++
 .../apache/tika/eval/XMLErrorLogUpdater.java    | 226 ++++++
 .../tika/eval/batch/DBConsumersManager.java     |  92 +++
 .../tika/eval/batch/EvalConsumerBuilder.java    | 134 ++++
 .../tika/eval/batch/EvalConsumersBuilder.java   | 133 ++++
 .../tika/eval/batch/FileComparerBuilder.java    | 122 +++
 .../eval/batch/SingleFileConsumerBuilder.java   | 108 +++
 .../apache/tika/eval/db/AbstractDBBuffer.java   |  77 ++
 .../java/org/apache/tika/eval/db/ColInfo.java   | 116 +++
 .../main/java/org/apache/tika/eval/db/Cols.java |  90 +++
 .../java/org/apache/tika/eval/db/DBBuffer.java  |  54 ++
 .../java/org/apache/tika/eval/db/DBUtil.java    | 201 +++++
 .../java/org/apache/tika/eval/db/H2Util.java    |  71 ++
 .../org/apache/tika/eval/db/MimeBuffer.java     | 144 ++++
 .../java/org/apache/tika/eval/db/TableInfo.java |  64 ++
 .../java/org/apache/tika/eval/io/DBWriter.java  | 141 ++++
 .../org/apache/tika/eval/io/ExtractReader.java  | 161 ++++
 .../java/org/apache/tika/eval/io/IDBWriter.java |  31 +
 .../apache/tika/eval/io/XMLLogMsgHandler.java   |  26 +
 .../org/apache/tika/eval/io/XMLLogReader.java   | 120 +++
 .../org/apache/tika/eval/reports/Report.java    | 197 +++++
 .../tika/eval/reports/ResultsReporter.java      | 295 +++++++
 .../tika/eval/reports/XLSXHREFFormatter.java    |  79 ++
 .../tika/eval/reports/XLSXNumFormatter.java     |  54 ++
 .../tika/eval/reports/XSLXCellFormatter.java    |  30 +
 .../tokens/AlphaIdeographFilterFactory.java     |  74 ++
 .../tika/eval/tokens/AnalyzerDeserializer.java  | 345 ++++++++
 .../tika/eval/tokens/AnalyzerManager.java       |  95 +++
 .../CJKBigramAwareLengthFilterFactory.java      |  74 ++
 .../eval/tokens/CommonTokenCountManager.java    | 141 ++++
 .../tika/eval/tokens/CommonTokenResult.java     |  37 +
 .../tika/eval/tokens/ContrastStatistics.java    |  78 ++
 .../tika/eval/tokens/TokenContraster.java       | 183 +++++
 .../eval/tokens/TokenCountPriorityQueue.java    |  49 ++
 .../apache/tika/eval/tokens/TokenCounter.java   | 167 ++++
 .../apache/tika/eval/tokens/TokenIntPair.java   |  82 ++
 .../tika/eval/tokens/TokenStatistics.java       | 127 +++
 .../tika/eval/util/LanguageIDWrapper.java       |  69 ++
 ...ache.lucene.analysis.util.TokenFilterFactory |  17 +
 .../src/main/resources/comparison-reports.xml   | 791 +++++++++++++++++++
 .../src/main/resources/lucene-analyzers.json    | 107 +++
 .../src/main/resources/lucene-char-mapping.txt  |   2 +
 .../src/main/resources/profile-reports.xml      | 148 ++++
 .../resources/tika-eval-comparison-config.xml   |  83 ++
 .../resources/tika-eval-profiler-config.xml     |  76 ++
 .../test/java/org/apache/tika/MockDBWriter.java |  73 ++
 .../apache/tika/eval/AnalyzerManagerTest.java   |  79 ++
 .../org/apache/tika/eval/ComparerBatchTest.java | 411 ++++++++++
 .../org/apache/tika/eval/ProfilerBatchTest.java | 236 ++++++
 .../apache/tika/eval/SimpleComparerTest.java    | 289 +++++++
 .../org/apache/tika/eval/TikaEvalCLITest.java   |  42 +
 .../apache/tika/eval/db/AbstractBufferTest.java | 160 ++++
 .../apache/tika/eval/io/ExtractReaderTest.java  |  85 ++
 .../tika/eval/io/FatalExceptionReaderTest.java  |  32 +
 .../tika/eval/reports/ResultsReporterTest.java  |  60 ++
 .../tika/eval/tokens/LuceneTokenCounter.java    | 191 +++++
 .../tika/eval/tokens/TokenCounterTest.java      | 131 +++
 .../org/apache/tika/eval/util/MimeUtilTest.java |  65 ++
 tika-eval/src/test/resources/commontokens/en    |   8 +
 tika-eval/src/test/resources/commontokens/es    |  10 +
 tika-eval/src/test/resources/commontokens/zh-cn |   8 +
 tika-eval/src/test/resources/commontokens/zh-tw |   8 +
 tika-eval/src/test/resources/log4j.properties   |  11 +
 .../src/test/resources/log4j_process.properties |  11 +
 ...ingle-file-profiler-crawl-extract-config.xml |  72 ++
 .../single-file-profiler-crawl-input-config.xml |  73 ++
 .../batch-logs/batch-process-fatal.xml          |  59 ++
 .../test-dirs/extractsA/file1.pdf.json          |   5 +
 .../extractsA/file10_permahang.txt.json         |   0
 .../test-dirs/extractsA/file11_oom.txt.json     |   0
 .../test-dirs/extractsA/file12_es.txt.json      |   4 +
 .../extractsA/file13_attachANotB.doc.json       |  10 +
 .../extractsA/file2_attachANotB.doc.json        |  10 +
 .../extractsA/file3_attachBNotA.doc.json        |   4 +
 .../test-dirs/extractsA/file4_emptyB.pdf.json   |   4 +
 .../test-dirs/extractsA/file5_emptyA.pdf.json   |   0
 .../test-dirs/extractsA/file6_accessEx.pdf.json |   1 +
 .../test-dirs/extractsA/file7_badJson.pdf.json  |   4 +
 .../test-dirs/extractsA/file8_IOEx.pdf.json     |   1 +
 .../test-dirs/extractsB/file1.pdf.json          |   2 +
 .../test-dirs/extractsB/file11_oom.txt.json     |   0
 .../test-dirs/extractsB/file12_es.txt.json      |   4 +
 .../extractsB/file13_attachANotB.doc.txt        |   1 +
 .../extractsB/file2_attachANotB.doc.json        |   4 +
 .../extractsB/file3_attachBNotA.doc.json        |  10 +
 .../test-dirs/extractsB/file4_emptyB.pdf.json   |   0
 .../test-dirs/extractsB/file5_emptyA.pdf.json   |   4 +
 .../test-dirs/extractsB/file6_accessEx.pdf.json |   1 +
 .../test-dirs/extractsB/file7_badJson.pdf.json  |   0
 .../test-dirs/extractsB/file8_IOEx.pdf.json     |   1 +
 .../resources/test-dirs/raw_input/file1.pdf     |  13 +
 .../test-dirs/raw_input/file11_oom.txt          |   2 +
 .../test-dirs/raw_input/file2_attachANotB.doc   |  13 +
 .../test-dirs/raw_input/file3_attachBNotA.doc   |  13 +
 .../test-dirs/raw_input/file4_emptyB.pdf        |  13 +
 .../test-dirs/raw_input/file5_emptyA.pdf        |  13 +
 .../test-dirs/raw_input/file6_accessEx.pdf      |  13 +
 .../test-dirs/raw_input/file7_badJson.pdf       |  13 +
 .../test-dirs/raw_input/file8_IOEx.pdf          |  13 +
 .../test-dirs/raw_input/file9_noextract.txt     |   1 +
 108 files changed, 9850 insertions(+)
----------------------------------------------------------------------


http://git-wip-us.apache.org/repos/asf/tika/blob/aa7a0c35/CHANGES.txt
----------------------------------------------------------------------
diff --git a/CHANGES.txt b/CHANGES.txt
index b8e2dec..55c8906 100644
--- a/CHANGES.txt
+++ b/CHANGES.txt
@@ -1,5 +1,7 @@
 Release 1.15 - ??
 
+  * tika-eval module (TIKA-1332).
+
   * Add parsers for EMF/WMF files (TIKA-2246/TIKA-2247).
 
   * Official mime types for BMP, EMF and WMF have been registered with

http://git-wip-us.apache.org/repos/asf/tika/blob/aa7a0c35/LICENSE.txt
----------------------------------------------------------------------
diff --git a/LICENSE.txt b/LICENSE.txt
index aa44a0d..e3cd6ff 100644
--- a/LICENSE.txt
+++ b/LICENSE.txt
@@ -378,3 +378,11 @@ Sample DXF file testDXF.dxf (in 
tika-parsers/src/test/resources/test-documents)
      documentation for any purpose is hereby granted without fee, provided 
      that the above copyright notice, author statement appear in all copies 
      of this software and related documentation.
+
+H2 Database in tika-eval
+    This software contains unmodified binary redistributions for
+    H2 database engine (http://www.h2database.com/),
+    which is dual licensed and available under the MPL 2.0
+    (Mozilla Public License) or under the EPL 1.0 (Eclipse Public License).
+    An original copy of the license agreement can be found at:
+    http://www.h2database.com/html/license.html

http://git-wip-us.apache.org/repos/asf/tika/blob/aa7a0c35/pom.xml
----------------------------------------------------------------------
diff --git a/pom.xml b/pom.xml
index fa690d1..56ed0ad 100644
--- a/pom.xml
+++ b/pom.xml
@@ -48,6 +48,7 @@
     <module>tika-langdetect</module>
     <module>tika-example</module>
     <module>tika-java7</module>
+    <module>tika-eval</module>
   </modules>
 
   <profiles>

http://git-wip-us.apache.org/repos/asf/tika/blob/aa7a0c35/tika-eval/pom.xml
----------------------------------------------------------------------
diff --git a/tika-eval/pom.xml b/tika-eval/pom.xml
new file mode 100644
index 0000000..ec2c18b
--- /dev/null
+++ b/tika-eval/pom.xml
@@ -0,0 +1,281 @@
+<?xml version="1.0" encoding="UTF-8"?>
+
+<!--
+  Licensed to the Apache Software Foundation (ASF) under one
+  or more contributor license agreements.  See the NOTICE file
+  distributed with this work for additional information
+  regarding copyright ownership.  The ASF licenses this file
+  to you under the Apache License, Version 2.0 (the
+  "License"); you may not use this file except in compliance
+  with the License.  You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+  Unless required by applicable law or agreed to in writing,
+  software distributed under the License is distributed on an
+  "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+  KIND, either express or implied.  See the License for the
+  specific language governing permissions and limitations
+  under the License.
+-->
+
+<project xmlns="http://maven.apache.org/POM/4.0.0"
+         xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
+         xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 
http://maven.apache.org/xsd/maven-4.0.0.xsd">
+
+    <modelVersion>4.0.0</modelVersion>
+    <properties>
+        <cli.version>1.3.1</cli.version> <!--sync version with tika-server or 
move to parent? -->
+        <lucene.version>6.2.1</lucene.version>
+        <poi.version>3.16-beta2</poi.version>
+
+    </properties>
+
+    <parent>
+        <groupId>org.apache.tika</groupId>
+        <artifactId>tika-parent</artifactId>
+        <version>1.15-SNAPSHOT</version>
+        <relativePath>../tika-parent/pom.xml</relativePath>
+    </parent>
+
+    <artifactId>tika-eval</artifactId>
+    <name>Apache Tika eval</name>
+    <url>http://tika.apache.org/</url>
+
+    <dependencies>
+        <dependency>
+            <groupId>${project.groupId}</groupId>
+            <artifactId>tika-core</artifactId>
+            <version>${project.version}</version>
+        </dependency>
+        <dependency>
+            <groupId>${project.groupId}</groupId>
+            <artifactId>tika-batch</artifactId>
+            <version>${project.version}</version>
+        </dependency>
+        <dependency>
+            <groupId>${project.groupId}</groupId>
+            <artifactId>tika-serialization</artifactId>
+            <version>${project.version}</version>
+        </dependency>
+        <dependency>
+            <groupId>org.apache.commons</groupId>
+            <artifactId>commons-math3</artifactId>
+            <version>3.4.1</version>
+        </dependency>
+        <dependency>
+            <groupId>org.apache.commons</groupId>
+            <artifactId>commons-csv</artifactId>
+            <version>1.1</version>
+        </dependency>
+        <dependency>
+            <groupId>com.h2database</groupId>
+            <artifactId>h2</artifactId>
+            <version>1.4.193</version>
+        </dependency>
+        <dependency>
+            <groupId>com.optimaize.languagedetector</groupId>
+            <artifactId>language-detector</artifactId>
+            <version>0.5</version>
+        </dependency>
+        <dependency>
+            <groupId>commons-cli</groupId>
+            <artifactId>commons-cli</artifactId>
+            <version>${cli.version}</version>
+        </dependency>
+        <dependency>
+            <groupId>commons-io</groupId>
+            <artifactId>commons-io</artifactId>
+            <version>2.4</version>
+        </dependency>
+
+        <dependency>
+            <groupId>org.apache.lucene</groupId>
+            <artifactId>lucene-core</artifactId>
+            <version>${lucene.version}</version>
+        </dependency>
+        <dependency>
+            <groupId>org.apache.lucene</groupId>
+            <artifactId>lucene-analyzers-common</artifactId>
+            <version>${lucene.version}</version>
+        </dependency>
+        <dependency>
+            <groupId>org.apache.lucene</groupId>
+            <artifactId>lucene-analyzers-icu</artifactId>
+            <version>${lucene.version}</version>
+        </dependency>
+        <dependency>
+            <groupId>org.apache.lucene</groupId>
+            <artifactId>lucene-memory</artifactId>
+            <version>${lucene.version}</version>
+        </dependency>
+        <dependency>
+            <groupId>org.apache.commons</groupId>
+            <artifactId>commons-lang3</artifactId>
+            <version>3.4</version>
+        </dependency>
+
+        <dependency>
+            <groupId>org.apache.poi</groupId>
+            <artifactId>poi</artifactId>
+            <version>${poi.version}</version>
+        </dependency>
+        <dependency>
+            <groupId>org.apache.poi</groupId>
+            <artifactId>poi-ooxml</artifactId>
+            <version>${poi.version}</version>
+        </dependency>
+        <dependency>
+            <groupId>org.apache.poi</groupId>
+            <artifactId>poi-ooxml-schemas</artifactId>
+            <version>${poi.version}</version>
+        </dependency>
+        <dependency>
+            <groupId>org.apache.poi</groupId>
+            <artifactId>poi-scratchpad</artifactId>
+            <version>${poi.version}</version>
+        </dependency>
+        <dependency>
+            <groupId>org.apache.tika</groupId>
+            <artifactId>tika-batch</artifactId>
+            <version>${project.version}</version>
+            <type>test-jar</type>
+            <scope>test</scope>
+        </dependency>
+        <dependency>
+            <groupId>org.apache.tika</groupId>
+            <artifactId>tika-core</artifactId>
+            <version>${project.version}</version>
+            <type>test-jar</type>
+            <scope>test</scope>
+        </dependency>
+        <dependency>
+            <groupId>junit</groupId>
+            <artifactId>junit</artifactId>
+            <scope>test</scope>
+        </dependency>
+
+
+    </dependencies>
+    <build>
+        <plugins>
+            <plugin>
+                <artifactId>maven-shade-plugin</artifactId>
+                <executions>
+                    <execution>
+                        <phase>package</phase>
+                        <goals>
+                            <goal>shade</goal>
+                        </goals>
+                        <configuration>
+                            <createDependencyReducedPom>
+                                false
+                            </createDependencyReducedPom>
+                            <!-- <filters> -->
+                            <transformers>
+                                <transformer 
implementation="org.apache.maven.plugins.shade.resource.ManifestResourceTransformer">
+                                    
<mainClass>org.apache.tika.eval.TikaEvalCLI</mainClass>
+                                </transformer>
+
+                                <transformer 
implementation="org.apache.maven.plugins.shade.resource.ServicesResourceTransformer"
 />
+                            </transformers>
+                        </configuration>
+                    </execution>
+                </executions>
+            </plugin>
+                <plugin>
+                    <artifactId>maven-remote-resources-plugin</artifactId>
+                    <version>1.5</version>
+                    <executions>
+                        <execution>
+                            <goals>
+                                <goal>bundle</goal>
+                            </goals>
+                        </execution>
+                    </executions>
+                    <configuration>
+                        <includes>
+                            <include>**/*.xml</include>
+                        </includes>
+                    </configuration>
+                </plugin>
+
+                <plugin>
+                    <groupId>org.apache.felix</groupId>
+                    <artifactId>maven-bundle-plugin</artifactId>
+                    <extensions>true</extensions>
+                    <configuration>
+                        <instructions>
+                            <Bundle-DocURL>${project.url}</Bundle-DocURL>
+                            <Bundle-Activator>
+                                org.apache.tika.config.TikaActivator
+                            </Bundle-Activator>
+                            
<Bundle-ActivationPolicy>lazy</Bundle-ActivationPolicy>
+                        </instructions>
+                    </configuration>
+                </plugin>
+                <plugin>
+                    <groupId>org.apache.rat</groupId>
+                    <artifactId>apache-rat-plugin</artifactId>
+                    <configuration>
+                        <excludes>
+                            
<exclude>src/test/resources/org/apache/tika/**</exclude>
+                        </excludes>
+                    </configuration>
+                </plugin>
+                <plugin>
+                    <groupId>org.apache.maven.plugins</groupId>
+                    <artifactId>maven-jar-plugin</artifactId>
+                    <executions>
+                        <execution>
+                            <goals>
+                                <goal>test-jar</goal>
+                            </goals>
+                        </execution>
+                    </executions>
+                </plugin>
+                <plugin>
+                    <artifactId>maven-failsafe-plugin</artifactId>
+                    <version>2.10</version>
+                    <configuration>
+                        <additionalClasspathElements>
+                            <additionalClasspathElement>
+                                
${project.build.directory}/${project.build.finalName}.jar
+                            </additionalClasspathElement>
+                        </additionalClasspathElements>
+                    </configuration>
+                    <executions>
+                        <execution>
+                            <goals>
+                                <goal>integration-test</goal>
+                                <goal>verify</goal>
+                            </goals>
+                        </execution>
+                    </executions>
+                </plugin>
+            </plugins>
+
+    </build>
+
+
+
+    <organization>
+        <name>The Apache Software Foundation</name>
+        <url>http://www.apache.org</url>
+    </organization>
+    <scm>
+        <url>http://svn.apache.org/viewvc/tika/trunk/tika-eval</url>
+        
<connection>scm:svn:http://svn.apache.org/repos/asf/tika/trunk/tika-eval</connection>
+        
<developerConnection>scm:svn:https://svn.apache.org/repos/asf/tika/trunk/tika-eval</developerConnection>
+    </scm>
+    <issueManagement>
+        <system>JIRA</system>
+        <url>https://issues.apache.org/jira/browse/TIKA</url>
+    </issueManagement>
+    <ciManagement>
+        <system>Jenkins</system>
+        <url>https://builds.apache.org/job/Tika-trunk/</url>
+    </ciManagement>
+
+
+</project>

http://git-wip-us.apache.org/repos/asf/tika/blob/aa7a0c35/tika-eval/src/main/java/org/apache/tika/eval/AbstractProfiler.java
----------------------------------------------------------------------
diff --git a/tika-eval/src/main/java/org/apache/tika/eval/AbstractProfiler.java 
b/tika-eval/src/main/java/org/apache/tika/eval/AbstractProfiler.java
new file mode 100644
index 0000000..24f7358
--- /dev/null
+++ b/tika-eval/src/main/java/org/apache/tika/eval/AbstractProfiler.java
@@ -0,0 +1,693 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.tika.eval;
+
+
+import java.io.IOException;
+import java.io.Reader;
+import java.io.StringReader;
+import java.nio.file.Files;
+import java.nio.file.Path;
+import java.nio.file.Paths;
+import java.sql.Types;
+import java.util.ArrayList;
+import java.util.Collections;
+import java.util.Comparator;
+import java.util.HashMap;
+import java.util.List;
+import java.util.Map;
+import java.util.concurrent.ArrayBlockingQueue;
+import java.util.concurrent.atomic.AtomicInteger;
+import java.util.regex.Matcher;
+import java.util.regex.Pattern;
+
+import com.optimaize.langdetect.DetectedLanguage;
+import org.apache.commons.io.FilenameUtils;
+import org.apache.commons.lang3.tuple.Pair;
+import org.apache.commons.math3.stat.descriptive.SummaryStatistics;
+import org.apache.tika.batch.FileResource;
+import org.apache.tika.batch.FileResourceConsumer;
+import org.apache.tika.batch.fs.FSProperties;
+import org.apache.tika.config.TikaConfig;
+import org.apache.tika.eval.db.ColInfo;
+import org.apache.tika.eval.db.Cols;
+import org.apache.tika.eval.db.TableInfo;
+import org.apache.tika.eval.io.IDBWriter;
+import org.apache.tika.eval.tokens.AnalyzerManager;
+import org.apache.tika.eval.tokens.CommonTokenCountManager;
+import org.apache.tika.eval.tokens.CommonTokenResult;
+import org.apache.tika.eval.tokens.TokenCounter;
+import org.apache.tika.eval.tokens.TokenIntPair;
+import org.apache.tika.eval.tokens.TokenStatistics;
+import org.apache.tika.eval.util.LanguageIDWrapper;
+import org.apache.tika.metadata.Metadata;
+import org.apache.tika.metadata.PagedText;
+import org.apache.tika.metadata.TikaCoreProperties;
+import org.apache.tika.parser.RecursiveParserWrapper;
+import org.apache.tika.utils.ExceptionUtils;
+
+public abstract class AbstractProfiler extends FileResourceConsumer {
+
+    private static final String[] EXTRACT_EXTENSIONS = {
+            ".json",
+            ".txt",
+            ""
+    };
+
+    private static final String[] COMPRESSION_EXTENSIONS = {
+            "",
+            ".bz2",
+            ".gzip",
+            ".zip",
+    };
+    static final long NON_EXISTENT_FILE_LENGTH = -1l;
+
+    public static TableInfo REF_EXTRACT_ERROR_TYPES = new 
TableInfo("ref_extract_error_types",
+            new ColInfo(Cols.EXTRACT_ERROR_TYPE_ID, Types.INTEGER),
+            new ColInfo(Cols.EXTRACT_ERROR_DESCRIPTION, Types.VARCHAR, 128)
+    );
+
+
+    public static TableInfo REF_PARSE_ERROR_TYPES = new 
TableInfo("ref_parse_error_types",
+            new ColInfo(Cols.PARSE_ERROR_TYPE_ID, Types.INTEGER),
+            new ColInfo(Cols.PARSE_ERROR_DESCRIPTION, Types.VARCHAR, 128)
+    );
+
+    public static TableInfo REF_PARSE_EXCEPTION_TYPES = new 
TableInfo("ref_parse_exception_types",
+            new ColInfo(Cols.PARSE_EXCEPTION_TYPE_ID, Types.INTEGER),
+            new ColInfo(Cols.PARSE_EXCEPTION_DESCRIPTION, Types.VARCHAR, 128)
+    );
+
+    public static final String TRUE = Boolean.toString(true);
+    public static final String FALSE = Boolean.toString(false);
+
+
+    protected static final AtomicInteger CONTAINER_ID = new AtomicInteger();
+    protected static final AtomicInteger ID = new AtomicInteger();
+
+
+    private final static String UNKNOWN_EXTENSION = "unk";
+    //make this configurable
+    private final static String DIGEST_KEY = "X-TIKA:digest:MD5";
+
+    private static CommonTokenCountManager commonTokenCountManager;
+    private String lastExtractExtension = null;
+
+    final AnalyzerManager analyzerManager;
+    final TokenCounter tokenCounter;
+
+    public enum EXTRACT_ERROR_TYPE {
+        //what do you see when you look at the extract file
+        NO_EXTRACT_FILE,
+        ZERO_BYTE_EXTRACT_FILE,
+        EXTRACT_PARSE_EXCEPTION
+    }
+
+    public enum EXCEPTION_TYPE {
+        RUNTIME,
+        ENCRYPTION,
+        ACCESS_PERMISSION,
+        UNSUPPORTED_VERSION,
+    }
+
+    public enum PARSE_ERROR_TYPE {
+        //what was gathered from the log file during the batch run
+        OOM,
+        TIMEOUT
+    }
+
+    public static TableInfo MIME_TABLE = new TableInfo("mimes",
+            new ColInfo(Cols.MIME_TYPE_ID, Types.INTEGER, "PRIMARY KEY"),
+            new ColInfo(Cols.MIME_STRING, Types.VARCHAR, 256),
+            new ColInfo(Cols.FILE_EXTENSION, Types.VARCHAR, 12)
+    );
+
+    private static Pattern FILE_NAME_CLEANER = 
Pattern.compile("\\.(json|txt)(\\.(bz2|gz|zip))?$");
+
+
+    final static int FILE_PATH_MAX_LEN = 512;//max len for varchar for 
file_path
+    final static int MAX_STRING_LENGTH = 1000000;
+    final static int MAX_LEN_FOR_LANG_ID = 20000;
+
+    //these remove runtime info from the stacktraces so
+    //that actual causes can be counted.
+    private final static Pattern CAUSED_BY_SNIPPER =
+            Pattern.compile("(Caused by: [^:]+):[^\\r\\n]+");
+
+    private final static Pattern ACCESS_PERMISSION_EXCEPTION =
+            
Pattern.compile("org\\.apache\\.tika\\.exception\\.AccessPermissionException");
+    private final static Pattern ENCRYPTION_EXCEPTION =
+            
Pattern.compile("org\\.apache\\.tika\\.exception\\.EncryptedDocumentException");
+
+    private TikaConfig config = TikaConfig.getDefaultConfig();//TODO: allow 
configuration
+    final LanguageIDWrapper langIder;
+    protected IDBWriter writer;
+
+    public static void loadCommonTokens(Path p) throws IOException {
+        commonTokenCountManager = new CommonTokenCountManager(p);
+    }
+
+    public AbstractProfiler(ArrayBlockingQueue<FileResource> fileQueue,
+                            IDBWriter writer) {
+        super(fileQueue);
+        this.writer = writer;
+        langIder = new LanguageIDWrapper();
+        try {
+            analyzerManager = AnalyzerManager.newInstance();
+            tokenCounter = new 
TokenCounter(analyzerManager.getGeneralAnalyzer(),
+                    analyzerManager.getAlphaIdeoAnalyzer());
+        } catch (IOException e) {
+            throw new RuntimeException(e);
+        }
+    }
+
+    protected void writeError(TableInfo extractErrorTable, String containerId,
+                              String filePath, Path extractsA) throws 
IOException {
+        Map<Cols, String> data = new HashMap<>();
+        data.put(Cols.CONTAINER_ID, containerId);
+        data.put(Cols.FILE_PATH, filePath);
+        int errorCode = -1;
+        long len = -1;
+        if (extractsA != null) {
+            try {
+                len = Files.size(extractsA);
+            } catch (IOException e) {
+                //swallow
+            }
+        }
+        if (extractsA == null) {
+            errorCode = EXTRACT_ERROR_TYPE.NO_EXTRACT_FILE.ordinal();
+        } else if (len == 0) {
+            errorCode = EXTRACT_ERROR_TYPE.ZERO_BYTE_EXTRACT_FILE.ordinal();
+        } else {
+            errorCode = EXTRACT_ERROR_TYPE.EXTRACT_PARSE_EXCEPTION.ordinal();
+        }
+        data.put(Cols.EXTRACT_ERROR_TYPE_ID, Integer.toString(errorCode));
+        writer.writeRow(extractErrorTable, data);
+
+    }
+
+    protected void writeProfileData(EvalFilePaths fps, int i, Metadata m,
+                                    String fileId, String containerId,
+                                    List<Integer> numAttachments, TableInfo 
profileTable) {
+
+        Map<Cols, String> data = new HashMap<>();
+        data.put(Cols.ID, fileId);
+        data.put(Cols.CONTAINER_ID, containerId);
+        data.put(Cols.MD5, m.get(DIGEST_KEY));
+
+        if ( i < numAttachments.size()) {
+            data.put(Cols.NUM_ATTACHMENTS, 
Integer.toString(numAttachments.get(i)));
+        }
+        data.put(Cols.ELAPSED_TIME_MILLIS, getTime(m));
+        data.put(Cols.NUM_METADATA_VALUES,
+                Integer.toString(countMetadataValues(m)));
+
+        Integer nPages = m.getInt(PagedText.N_PAGES);
+        if (nPages != null) {
+            data.put(Cols.NUM_PAGES, Integer.toString(nPages));
+        }
+
+        //if the outer wrapper document
+        if (i == 0) {
+
+            data.put(Cols.IS_EMBEDDED, FALSE);
+            data.put(Cols.FILE_NAME, 
fps.getRelativeSourceFilePath().getFileName().toString());
+        } else {
+            data.put(Cols.IS_EMBEDDED, TRUE);
+            data.put(Cols.FILE_NAME, 
FilenameUtils.getName(m.get(RecursiveParserWrapper.EMBEDDED_RESOURCE_PATH)));
+        }
+        data.put(Cols.FILE_EXTENSION,
+                
FilenameUtils.getExtension(fps.getRelativeSourceFilePath().getFileName().toString()));
+        long srcFileLen = getSourceFileLength(m);
+        if (srcFileLen > NON_EXISTENT_FILE_LENGTH) {
+            data.put(Cols.LENGTH, Long.toString(srcFileLen));
+        } else {
+            data.put(Cols.LENGTH, "");
+        }
+        int numMetadataValues = countMetadataValues(m);
+        data.put(Cols.NUM_METADATA_VALUES,
+                Integer.toString(numMetadataValues));
+
+        data.put(Cols.ELAPSED_TIME_MILLIS,
+                getTime(m));
+
+        String content = getContent(m, MAX_STRING_LENGTH);
+        if (content == null || content.trim().length() == 0) {
+            data.put(Cols.HAS_CONTENT, FALSE);
+        } else {
+            data.put(Cols.HAS_CONTENT, TRUE);
+        }
+        getFileTypes(m, data);
+        try {
+            writer.writeRow(profileTable, data);
+        } catch (IOException e) {
+            throw new RuntimeException(e);
+        }
+    }
+
+    protected void writeExceptionData(String fileId, Metadata m, TableInfo 
exceptionTable) {
+        Map<Cols, String> data = new HashMap<>();
+        getExceptionStrings(m, data);
+        if (data.keySet().size() > 0) {
+            try {
+                data.put(Cols.ID, fileId);
+                writer.writeRow(exceptionTable, data);
+            } catch (IOException e) {
+                throw new RuntimeException(e);
+            }
+        }
+    }
+
+    /**
+     * Checks to see if metadata is null or content is empty (null or only 
whitespace).
+     * If any of these, then this does no processing, and the fileId is not
+     * entered into the content table.
+     *
+     * @param fileId
+     * @param m
+     * @param fieldName
+     * @param contentsTable
+     */
+    protected void writeContentData(String fileId, Metadata m,
+                                    String fieldName, TableInfo contentsTable) 
throws IOException {
+        if (m == null) {
+            return;
+        }
+
+        String content = getContent(m, MAX_STRING_LENGTH);
+        if (content == null || content.trim().length() == 0) {
+            return;
+        }
+        tokenCounter.clear(fieldName);
+        tokenCounter.add(fieldName, content);
+
+        Map<Cols, String> data = new HashMap<>();
+        data.put(Cols.ID, fileId);
+        data.put(Cols.CONTENT_LENGTH, Integer.toString(content.length()));
+        langid(m, data);
+        String langid = data.get(Cols.LANG_ID_1);
+        langid = (langid == null) ? "" : langid;
+
+        writeTokenCounts(data, fieldName, tokenCounter);
+        CommonTokenResult commonTokenResult = null;
+        try {
+            commonTokenResult = 
commonTokenCountManager.countTokenOverlaps(langid,
+                    tokenCounter.getAlphaTokens(fieldName));
+        } catch (IOException e) {
+            logger.error(e.getMessage(), e);
+        }
+        data.put(Cols.COMMON_TOKENS_LANG, commonTokenResult.getLangCode());
+        data.put(Cols.NUM_COMMON_TOKENS, 
Integer.toString(commonTokenResult.getTokens()));
+        TokenStatistics tokenStatistics = 
tokenCounter.getTokenStatistics(fieldName);
+        TokenStatistics alphaTokenStatistics = 
tokenCounter.getAlphaTokenStatistics(fieldName);
+        data.put(Cols.NUM_UNIQUE_TOKENS,
+                Integer.toString(tokenStatistics.getTotalUniqueTokens()));
+        data.put(Cols.NUM_TOKENS,
+                Integer.toString(tokenStatistics.getTotalTokens()));
+        data.put(Cols.NUM_ALPHABETIC_TOKENS,
+                Integer.toString(alphaTokenStatistics.getTotalTokens()));
+
+        data.put(Cols.TOKEN_ENTROPY_RATE,
+                Double.toString(tokenStatistics.getEntropy()));
+        SummaryStatistics summStats = tokenStatistics.getSummaryStatistics();
+        data.put(Cols.TOKEN_LENGTH_SUM,
+                Integer.toString((int) summStats.getSum()));
+
+        data.put(Cols.TOKEN_LENGTH_MEAN,
+                Double.toString(summStats.getMean()));
+
+        data.put(Cols.TOKEN_LENGTH_STD_DEV,
+                Double.toString(summStats.getStandardDeviation()));
+        unicodeBlocks(m, data);
+        try {
+            writer.writeRow(contentsTable, data);
+        } catch (IOException e) {
+            throw new RuntimeException(e);
+        }
+    }
+
+    String getTime(Metadata m) {
+        String elapsed = "-1";
+
+        String v = m.get(RecursiveParserWrapper.PARSE_TIME_MILLIS);
+        if (v != null) {
+            return v;
+        }
+        return elapsed;
+    }
+
+    int countMetadataValues(Metadata m) {
+        if (m == null) {
+            return 0;
+        }
+        int i = 0;
+        for (String n : m.names()) {
+            i += m.getValues(n).length;
+        }
+        return i;
+    }
+
    /**
     * Extracts parse-exception information from the metadata into the data
     * map.  "Expected" exceptions (access-permission and encryption
     * failures) are recorded by type id only; any other runtime exception
     * also gets its full stack trace plus a normalized "sort" trace
     * (object ids snipped, package abbreviated) for easier grouping.
     * Leaves the map untouched when no exception was recorded.
     *
     * @param metadata metadata possibly containing an exception trace
     * @param data map into which the exception columns are written
     */
    void getExceptionStrings(Metadata metadata, Map<Cols, String> data) {

        String fullTrace = metadata.get(TikaCoreProperties.TIKA_META_EXCEPTION_PREFIX + "runtime");

        if (fullTrace == null) {
            //fall back to an exception recorded for an embedded document
            fullTrace = metadata.get(RecursiveParserWrapper.EMBEDDED_EXCEPTION);
        }

        if (fullTrace != null) {
            //check for "expected" exceptions...exceptions
            //that can't be fixed.
            //Do not store trace for "expected" exceptions

            Matcher matcher = ACCESS_PERMISSION_EXCEPTION.matcher(fullTrace);
            if (matcher.find()) {
                data.put(Cols.PARSE_EXCEPTION_TYPE_ID,
                        Integer.toString(EXCEPTION_TYPE.ACCESS_PERMISSION.ordinal()));
                return;
            }
            matcher = ENCRYPTION_EXCEPTION.matcher(fullTrace);
            if (matcher.find()) {
                data.put(Cols.PARSE_EXCEPTION_TYPE_ID,
                        Integer.toString(EXCEPTION_TYPE.ENCRYPTION.ordinal()));
                return;
            }

            //anything else is treated as a generic runtime parse exception
            data.put(Cols.PARSE_EXCEPTION_TYPE_ID,
                    Integer.toString(EXCEPTION_TYPE.RUNTIME.ordinal()));

            data.put(Cols.ORIG_STACK_TRACE, fullTrace);
            //TikaExceptions can have object ids, as in the "@2b1ea6ee" in:
            //org.apache.tika.exception.TikaException: TIKA-198: Illegal
            //IOException from org.apache.tika.parser.microsoft.OfficeParser@2b1ea6ee
            //For reporting purposes, let's snip off the object id so that we can more
            //easily count exceptions.
            String sortTrace = ExceptionUtils.trimMessage(fullTrace);

            matcher = CAUSED_BY_SNIPPER.matcher(sortTrace);
            sortTrace = matcher.replaceAll("$1");
            //abbreviate the package prefix to keep the sort trace compact
            sortTrace = sortTrace.replaceAll("org.apache.tika.", "o.a.t.");
            data.put(Cols.SORT_STACK_TRACE, sortTrace);
        }
    }
+
+    protected static String getContent(Metadata metadata, int maxLength) {
+        if (metadata == null) {
+            return "";
+        }
+        String c = metadata.get(RecursiveParserWrapper.TIKA_CONTENT);
+        if (c == null) {
+            return "";
+        }
+        if (c.length() > maxLength) {
+            c = c.substring(0, maxLength);
+        }
+        return c;
+    }
+
+    void unicodeBlocks(Metadata metadata, Map<Cols, String> data) {
+        String content = getContent(metadata, MAX_LEN_FOR_LANG_ID);
+        if (content.length() < 200) {
+            return;
+        }
+        String s = content;
+        if (content.length() > MAX_LEN_FOR_LANG_ID) {
+            s = content.substring(0, MAX_LEN_FOR_LANG_ID);
+        }
+        Map<String, Integer> m = new HashMap<>();
+        Reader r = new StringReader(s);
+        try {
+            int c = r.read();
+            while (c != -1) {
+                Character.UnicodeBlock block = Character.UnicodeBlock.of(c);
+                String blockString = (block == null) ? "NULL" : 
block.toString();
+                Integer i = m.get(blockString);
+                if (i == null) {
+                    i = 0;
+                }
+                i++;
+                if (block == null) {
+                    blockString = "NULL";
+                }
+                m.put(blockString, i);
+                c = r.read();
+            }
+        } catch (IOException e) {
+            e.printStackTrace();
+            //swallow
+        }
+
+        List<Pair<String, Integer>> pairs = new ArrayList<>();
+        for (Map.Entry<String, Integer> e : m.entrySet()) {
+            pairs.add(Pair.of(e.getKey(), e.getValue()));
+        }
+        Collections.sort(pairs, new Comparator<Pair<String, Integer>>() {
+            @Override
+            public int compare(Pair<String, Integer> o1, Pair<String, Integer> 
o2) {
+                return o2.getValue().compareTo(o1.getValue());
+            }
+        });
+        StringBuilder sb = new StringBuilder();
+
+        for (int i = 0; i < 20 && i < pairs.size(); i++) {
+            if (i > 0) {
+                sb.append(" | ");
+            }
+            sb.append(pairs.get(i).getKey()+": "+pairs.get(i).getValue());
+        }
+        data.put(Cols.UNICODE_CHAR_BLOCKS, sb.toString());
+    }
+
+    void langid(Metadata metadata, Map<Cols, String> data) {
+        String content = getContent(metadata, MAX_LEN_FOR_LANG_ID);
+        if (content.length() < 50) {
+            return;
+        }
+        String s = content;
+        if (content.length() > MAX_LEN_FOR_LANG_ID) {
+            s = content.substring(0, MAX_LEN_FOR_LANG_ID);
+        }
+        List<DetectedLanguage> probabilities = langIder.getProbabilities(s);
+        if (probabilities.size() > 0) {
+            data.put(Cols.LANG_ID_1, 
probabilities.get(0).getLocale().getLanguage());
+            data.put(Cols.LANG_ID_PROB_1,
+            Double.toString(probabilities.get(0).getProbability()));
+        }
+        if (probabilities.size() > 1) {
+            data.put(Cols.LANG_ID_2, 
probabilities.get(1).getLocale().getLanguage());
+            data.put(Cols.LANG_ID_PROB_2,
+            Double.toString(probabilities.get(1).getProbability()));
+        }
+
+    }
+
+    void getFileTypes(Metadata metadata, Map<Cols, String> output) {
+        if (metadata == null) {
+            return;
+        }
+        String type = metadata.get(Metadata.CONTENT_TYPE);
+        if (type == null) {
+            return;
+        }
+        int mimeId = writer.getMimeId(type);
+        output.put(Cols.MIME_TYPE_ID, Integer.toString(mimeId));
+    }
+
+    void writeTokenCounts(Map<Cols, String> data, String field,
+                          TokenCounter tokenCounter) {
+
+
+        int stops = 0;
+        int i = 0;
+        StringBuilder sb = new StringBuilder();
+        TokenStatistics tokenStatistics = 
tokenCounter.getTokenStatistics(field);
+        for (TokenIntPair t : tokenStatistics.getTopN()) {
+            if (i++ > 0) {
+                sb.append(" | ");
+            }
+            sb.append(t.getToken() + ": " + t.getValue());
+        }
+
+        data.put(Cols.TOP_N_TOKENS, sb.toString());
+    }
+
+
    /**
     * Closes the underlying database writer and releases its resources.
     *
     * @throws IOException on failure to close the writer
     */
    public void closeWriter() throws IOException {
        writer.close();
    }
+
+
+    /**
+     *
+     * @param metadata
+     * @param extractDir
+     * @return evalfilepaths for files if crawling an extract directory
+     */
+    protected EvalFilePaths getPathsFromExtractCrawl(Metadata metadata,
+                                                     Path extractDir) {
+        String relExtractFilePath = metadata.get(FSProperties.FS_REL_PATH);
+        Matcher m = FILE_NAME_CLEANER.matcher(relExtractFilePath);
+        Path relativeSourceFilePath = Paths.get(m.replaceAll(""));
+        //just try slapping the relextractfilepath on the extractdir
+        Path extractFile = extractDir.resolve(relExtractFilePath);
+        if (! Files.isRegularFile(extractFile)) {
+            //if that doesn't work, try to find the right extract file.
+            //This is necessary if crawling extractsA and trying to find a 
file in
+            //extractsB that is not in the same format: json vs txt or 
compressed
+            extractFile = findFile(extractDir, relativeSourceFilePath);
+        }
+        return new EvalFilePaths(relativeSourceFilePath, extractFile);
+    }
+    //call this if the crawler is crawling through the src directory
+    protected EvalFilePaths getPathsFromSrcCrawl(Metadata metadata, Path 
srcDir,
+                                                 Path extractDir) {
+        Path relativeSourceFilePath = 
Paths.get(metadata.get(FSProperties.FS_REL_PATH));
+        Path extractFile = findFile(extractDir, relativeSourceFilePath);
+        Path inputFile = srcDir.resolve(relativeSourceFilePath);
+        long srcLen = -1l;
+        //try to get the length of the source file in case there was an error
+        //in both extracts
+        try {
+            srcLen = Files.size(inputFile);
+        } catch (IOException e) {
+            logger.warn("Couldn't get length for: 
"+inputFile.toAbsolutePath());
+        }
+        return new EvalFilePaths(relativeSourceFilePath, extractFile, srcLen);
+    }
+
+    /**
+     *
+     * @param extractRootDir
+     * @param relativeSourceFilePath
+     * @return extractFile or null if couldn't find one.
+     */
+    private Path findFile(Path extractRootDir, Path relativeSourceFilePath) {
+        String relSrcFilePathString = relativeSourceFilePath.toString();
+        if (lastExtractExtension != null) {
+            Path candidate = 
extractRootDir.resolve(relSrcFilePathString+lastExtractExtension);
+            if (Files.isRegularFile(candidate)) {
+                return candidate;
+            }
+        }
+        for (String ext : EXTRACT_EXTENSIONS) {
+            for (String compress : COMPRESSION_EXTENSIONS) {
+                Path candidate = 
extractRootDir.resolve(relSrcFilePathString+ext+compress);
+                if (Files.isRegularFile(candidate)) {
+                    lastExtractExtension = ext+compress;
+                    return candidate;
+                }
+            }
+        }
+        return null;
+    }
+
+    protected long getSourceFileLength(EvalFilePaths fps, List<Metadata> 
metadataList) {
+        if (fps.getSourceFileLength() > NON_EXISTENT_FILE_LENGTH) {
+            return fps.getSourceFileLength();
+        }
+        return getSourceFileLength(metadataList);
+    }
+
+    long getSourceFileLength(List<Metadata> metadataList) {
+        if (metadataList == null || metadataList.size() < 1) {
+            return NON_EXISTENT_FILE_LENGTH;
+        }
+        return getSourceFileLength(metadataList.get(0));
+    }
+
+    long getSourceFileLength(Metadata m) {
+        String lenString = m.get(Metadata.CONTENT_LENGTH);
+        if (lenString == null) {
+            return NON_EXISTENT_FILE_LENGTH;
+        }
+        try {
+            return Long.parseLong(lenString);
+        } catch (NumberFormatException e) {
+            //swallow
+        }
+        return NON_EXISTENT_FILE_LENGTH;
+    }
+
+    protected long getFileLength(Path p) {
+        if (p != null && Files.isRegularFile(p)) {
+            try {
+                return Files.size(p);
+            } catch (IOException e) {
+                //swallow
+            }
+        }
+        return NON_EXISTENT_FILE_LENGTH;
+    }
+
+    /**
+     *
+     * @param list
+     * @return empty list if input list is empty or null
+     */
+    static List<Integer> countAttachments(List<Metadata> list) {
+        List<Integer> ret = new ArrayList<>();
+        if (list == null || list.size() == 0) {
+            return ret;
+        }
+        //container document attachment count = list.size()-1
+        ret.add(list.size()-1);
+
+        Map<String, Integer> counts = new HashMap<>();
+        for (int i = 1; i < list.size(); i++) {
+            String path = 
list.get(i).get(RecursiveParserWrapper.EMBEDDED_RESOURCE_PATH);
+            if (path == null) {
+                //shouldn't ever happen
+                continue;
+            }
+            String[] parts = path.split("/");
+            StringBuilder parent = new StringBuilder();
+            for (int end = 1; end < parts.length-1; end++) {
+                parent.setLength(0);
+                join("/", parent, parts, 1, end);
+                String parentPath = parent.toString();
+                Integer count = counts.get(parentPath);
+                if (count == null) {
+                    count = 1;
+                } else {
+                    count++;
+                }
+                counts.put(parentPath, count);
+            }
+        }
+
+        for (int i = 1; i < list.size(); i++) {
+            Integer count = 
counts.get(list.get(i).get(RecursiveParserWrapper.EMBEDDED_RESOURCE_PATH));
+            if (count == null) {
+                count = 0;
+            }
+            ret.add(i, count);
+        }
+        return ret;
+
+
+    }
+
+    private static void join(String delimiter, StringBuilder sb, String[] 
parts, int start, int end) {
+        for (int i = start; i <= end; i++) {
+            sb.append(delimiter);
+            sb.append(parts[i]);
+        }
+    }
+}
+

http://git-wip-us.apache.org/repos/asf/tika/blob/aa7a0c35/tika-eval/src/main/java/org/apache/tika/eval/EvalFilePaths.java
----------------------------------------------------------------------
diff --git a/tika-eval/src/main/java/org/apache/tika/eval/EvalFilePaths.java 
b/tika-eval/src/main/java/org/apache/tika/eval/EvalFilePaths.java
new file mode 100644
index 0000000..1a3d29c
--- /dev/null
+++ b/tika-eval/src/main/java/org/apache/tika/eval/EvalFilePaths.java
@@ -0,0 +1,108 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.tika.eval;
+
+import static org.apache.tika.eval.AbstractProfiler.NON_EXISTENT_FILE_LENGTH;
+
+import java.io.IOException;
+import java.nio.file.Files;
+import java.nio.file.Path;
+
+/**
+ * Simple struct to keep track of relative path of source file (
+ * original binary file, e.g. /subdir/document1.doc)
+ * and the extract file (e.g. /subdir/document1.doc.json).
+ */
+class EvalFilePaths {
+
+    private final Path relativeSourceFilePath;
+    private final Path extractFile;
+
+    private long sourceFileLength = NON_EXISTENT_FILE_LENGTH;
+    private long extractFileLength = NON_EXISTENT_FILE_LENGTH;
+
+
+    public EvalFilePaths(Path relativeSourceFilePath, Path extractFile, long 
srcFileLen) {
+        this(relativeSourceFilePath, extractFile);
+        this.sourceFileLength = srcFileLen;
+    }
+
+    public EvalFilePaths(Path relativeSourceFilePath, Path extractFile) {
+        if (extractFile != null && Files.isRegularFile(extractFile)) {
+            try {
+                extractFileLength = Files.size(extractFile);
+            } catch (IOException e) {
+                //swallow ?
+            }
+        }
+        this.relativeSourceFilePath = relativeSourceFilePath;
+        this.extractFile = extractFile;
+    }
+
+    public Path getRelativeSourceFilePath() {
+        return relativeSourceFilePath;
+    }
+
+    //this path may or may not exist and it could be null!
+    public Path getExtractFile() {
+        return extractFile;
+    }
+
+    //if it doesn't exist, it'll be -1l.
+    public long getSourceFileLength() {
+        return sourceFileLength;
+    }
+
+    public long getExtractFileLength() {
+        return extractFileLength;
+    }
+
+    @Override
+    public boolean equals(Object o) {
+        if (this == o) return true;
+        if (o == null || getClass() != o.getClass()) return false;
+
+        EvalFilePaths that = (EvalFilePaths) o;
+
+        if (sourceFileLength != that.sourceFileLength) return false;
+        if (extractFileLength != that.extractFileLength) return false;
+        if (relativeSourceFilePath != null ? 
!relativeSourceFilePath.equals(that.relativeSourceFilePath) : 
that.relativeSourceFilePath != null)
+            return false;
+        return extractFile != null ? extractFile.equals(that.extractFile) : 
that.extractFile == null;
+
+    }
+
+    @Override
+    public int hashCode() {
+        int result = relativeSourceFilePath != null ? 
relativeSourceFilePath.hashCode() : 0;
+        result = 31 * result + (extractFile != null ? extractFile.hashCode() : 
0);
+        result = 31 * result + (int) (sourceFileLength ^ (sourceFileLength >>> 
32));
+        result = 31 * result + (int) (extractFileLength ^ (extractFileLength 
>>> 32));
+        return result;
+    }
+
+    @Override
+    public String toString() {
+        return "EvalFilePaths{" +
+                "relativeSourceFilePath=" + relativeSourceFilePath +
+                ", extractFile=" + extractFile +
+                ", sourceFileLength=" + sourceFileLength +
+                ", extractFileLength=" + extractFileLength +
+                '}';
+    }
+}

http://git-wip-us.apache.org/repos/asf/tika/blob/aa7a0c35/tika-eval/src/main/java/org/apache/tika/eval/ExtractComparer.java
----------------------------------------------------------------------
diff --git a/tika-eval/src/main/java/org/apache/tika/eval/ExtractComparer.java 
b/tika-eval/src/main/java/org/apache/tika/eval/ExtractComparer.java
new file mode 100644
index 0000000..8b3d266
--- /dev/null
+++ b/tika-eval/src/main/java/org/apache/tika/eval/ExtractComparer.java
@@ -0,0 +1,455 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.tika.eval;
+
+import java.io.IOException;
+import java.nio.file.Path;
+import java.sql.Types;
+import java.util.HashMap;
+import java.util.HashSet;
+import java.util.List;
+import java.util.Map;
+import java.util.Set;
+import java.util.concurrent.ArrayBlockingQueue;
+
+import org.apache.commons.cli.HelpFormatter;
+import org.apache.commons.cli.Option;
+import org.apache.commons.cli.Options;
+import org.apache.commons.io.FilenameUtils;
+import org.apache.tika.batch.FileResource;
+import org.apache.tika.batch.fs.FSProperties;
+import org.apache.tika.config.TikaConfig;
+import org.apache.tika.eval.db.ColInfo;
+import org.apache.tika.eval.db.Cols;
+import org.apache.tika.eval.db.TableInfo;
+import org.apache.tika.eval.io.ExtractReader;
+import org.apache.tika.eval.io.IDBWriter;
+import org.apache.tika.eval.tokens.ContrastStatistics;
+import org.apache.tika.eval.tokens.TokenContraster;
+import org.apache.tika.eval.tokens.TokenIntPair;
+import org.apache.tika.metadata.Metadata;
+import org.apache.tika.parser.RecursiveParserWrapper;
+
+public class ExtractComparer extends AbstractProfiler {
+
+    static Options OPTIONS;
+    static {
+        Option extractsA = new Option("extractsA", true, "directory for 
extractsA files");
+        extractsA.setRequired(true);
+
+        Option extractsB = new Option("extractsB", true, "directory for 
extractsB files");
+        extractsB.setRequired(true);
+
+        Option db = new Option("db", true, "db file to which to write 
results");
+        db.setRequired(true);
+
+        Option inputDir = new Option("inputDir", true,
+                "optional: directory of original binary input files if it 
exists " +
+                        "or can be the same as -extractsA or -extractsB. If 
not specified, -inputDir=-extractsA");
+        inputDir.setRequired(true);
+
+
+        OPTIONS = new Options()
+                .addOption(extractsA)
+                .addOption(extractsB)
+                .addOption(db)
+                .addOption(inputDir)
+                .addOption("bc", "optional: tika-batch config file")
+                .addOption("numConsumers", true, "optional: number of consumer 
threads")
+                .addOption(new Option("alterExtract", true,
+                        "for json-formatted extract files, " +
+                                "process full metadata list ('as_is'=default), 
" +
+                                "take just the first/container document 
('first_only'), " +
+                                "concatenate all content into the first 
metadata item ('concatenate_content')")
+                );
+    }
+
    /**
     * Prints command-line usage for the Compare tool to stdout.
     */
    public static void USAGE() {
        HelpFormatter helpFormatter = new HelpFormatter();
        helpFormatter.printHelp(
                80,
                "java -jar tika-eval-x.y.jar Compare -extractsA extractsA -extractsB extractsB -db mydb",
                "Tool: Compare",
                ExtractComparer.OPTIONS,
                "Note: for h2 db, do not include the .mv.db at the end of the db name.");
    }
+
+    private final static String FIELD_A = "fa";
+    private final static String FIELD_B = "fb";
+
+    public static TableInfo REF_PAIR_NAMES = new TableInfo("pair_names",
+            new ColInfo(Cols.DIR_NAME_A, Types.VARCHAR, 128),
+            new ColInfo(Cols.DIR_NAME_B, Types.VARCHAR, 128)
+    );
+
+    public static TableInfo COMPARISON_CONTAINERS = new TableInfo("containers",
+            new ColInfo(Cols.CONTAINER_ID, Types.INTEGER, "PRIMARY KEY"),
+            new ColInfo(Cols.FILE_PATH, Types.VARCHAR, FILE_PATH_MAX_LEN),
+            new ColInfo(Cols.FILE_EXTENSION, Types.VARCHAR, 12),
+            new ColInfo(Cols.LENGTH, Types.BIGINT),
+            new ColInfo(Cols.EXTRACT_FILE_LENGTH_A, Types.BIGINT),
+            new ColInfo(Cols.EXTRACT_FILE_LENGTH_B, Types.BIGINT)
+    );
+
+    public static TableInfo CONTENT_COMPARISONS = new 
TableInfo("content_comparisons",
+            new ColInfo(Cols.ID, Types.INTEGER, "PRIMARY KEY"),
+            new ColInfo(Cols.TOP_10_UNIQUE_TOKEN_DIFFS_A, Types.VARCHAR, 1024),
+            new ColInfo(Cols.TOP_10_UNIQUE_TOKEN_DIFFS_B, Types.VARCHAR, 1024),
+            new ColInfo(Cols.TOP_10_MORE_IN_A, Types.VARCHAR, 1024),
+            new ColInfo(Cols.TOP_10_MORE_IN_B, Types.VARCHAR, 1024),
+            new ColInfo(Cols.DICE_COEFFICIENT, Types.FLOAT),
+            new ColInfo(Cols.OVERLAP, Types.FLOAT)
+    );
+
+    public static TableInfo PROFILES_A = new TableInfo( "profiles_a",
+            ExtractProfiler.PROFILE_TABLE.getColInfos());
+
+    public static TableInfo PROFILES_B = new TableInfo( "profiles_b",
+            ExtractProfiler.PROFILE_TABLE.getColInfos());
+
+    public static TableInfo EMBEDDED_FILE_PATH_TABLE_A = new TableInfo( 
"emb_path_a",
+            ExtractProfiler.EMBEDDED_FILE_PATH_TABLE.getColInfos());
+
+    public static TableInfo EMBEDDED_FILE_PATH_TABLE_B = new TableInfo( 
"emb_path_b",
+            ExtractProfiler.EMBEDDED_FILE_PATH_TABLE.getColInfos());
+
+
+    public static TableInfo CONTENTS_TABLE_A = new TableInfo( "contents_a",
+            ExtractProfiler.CONTENTS_TABLE.getColInfos());
+
+    public static TableInfo CONTENTS_TABLE_B = new TableInfo( "contents_b",
+            ExtractProfiler.CONTENTS_TABLE.getColInfos());
+
+    public static TableInfo EXCEPTION_TABLE_A = new TableInfo ("exceptions_a",
+            ExtractProfiler.EXCEPTION_TABLE.getColInfos());
+
+    public static TableInfo EXCEPTION_TABLE_B = new TableInfo ("exceptions_b",
+            ExtractProfiler.EXCEPTION_TABLE.getColInfos());
+
+    public static TableInfo ERROR_TABLE_A = new TableInfo("extract_errors_a",
+            ExtractProfiler.ERROR_TABLE.getColInfos());
+    public static TableInfo ERROR_TABLE_B = new TableInfo("extract_errors_b",
+            ExtractProfiler.ERROR_TABLE.getColInfos());
+
+
+    //need to parameterize?
+    private final TikaConfig config = TikaConfig.getDefaultConfig();
+
+    private final Path inputDir;
+    private final Path extractsA;
+    private final Path extractsB;
+
+    private final long minJsonLength;
+    private final long maxJsonLength;
+    private final ExtractReader.ALTER_METADATA_LIST alterExtractList;
+
+    private final TokenContraster tokenContraster = new TokenContraster();
+    private final ExtractReader extractReader = new ExtractReader();
+
    /**
     * @param queue queue of file resources to process
     * @param inputDir directory of original binary input files; may be the
     *                 same as one of the extract directories when crawling
     *                 an extract directory
     * @param extractsA root directory of the "A" extracts
     * @param extractsB root directory of the "B" extracts
     * @param writer db writer for the comparison results
     * @param minJsonLength skip pairs in which both extracts exist and are
     *                      shorter than this; -1 to disable
     * @param maxJsonLength skip pairs in which either extract is longer
     *                      than this; -1 to disable
     * @param alterExtractList how to transform the metadata list loaded
     *                         from a json extract
     */
    public ExtractComparer(ArrayBlockingQueue<FileResource> queue,
                           Path inputDir, Path extractsA, Path extractsB,
                           IDBWriter writer, long minJsonLength,
                           long maxJsonLength, ExtractReader.ALTER_METADATA_LIST alterExtractList) {
        super(queue, writer);
        this.minJsonLength = minJsonLength;
        this.maxJsonLength = maxJsonLength;
        this.inputDir = inputDir;
        this.extractsA = extractsA;
        this.extractsB = extractsB;
        this.alterExtractList = alterExtractList;
    }
+
+    @Override
+    public boolean processFileResource(FileResource fileResource) {
+        Metadata metadata = fileResource.getMetadata();
+        EvalFilePaths fpsA = null;
+        EvalFilePaths fpsB = null;
+
+        if (inputDir != null && (inputDir.equals(extractsA) ||
+                inputDir.equals(extractsB))) {
+            //crawling an extract dir
+            fpsA = getPathsFromExtractCrawl(metadata, extractsA);
+            fpsB = getPathsFromExtractCrawl(metadata, extractsB);
+
+        } else {
+            fpsA = getPathsFromSrcCrawl(metadata, inputDir, extractsA);
+            fpsB = getPathsFromSrcCrawl(metadata, inputDir, extractsB);
+        }
+
+            if (minJsonLength > -1) {
+                //if both files exist and are < minJsonLength, skip em
+                if (fpsA.getExtractFileLength() > NON_EXISTENT_FILE_LENGTH
+                        && fpsA.getExtractFileLength() < minJsonLength
+                        && fpsB.getExtractFileLength() > 
NON_EXISTENT_FILE_LENGTH
+                        && fpsB.getExtractFileLength() < minJsonLength) {
+                    return false;
+                }
+            }
+            if (maxJsonLength > -1) {
+                if ((fpsA.getExtractFileLength() > maxJsonLength) ||
+                        (fpsB.getExtractFileLength() > maxJsonLength)) {
+                    return false;
+                }
+            }
+
+
+        try {
+            compareFiles(fpsA, fpsB);
+        } catch (Throwable e) {
+            e.printStackTrace();
+            //this should be cataclysmic...
+            throw new RuntimeException("Exception while working on: " +
+                    metadata.get(FSProperties.FS_REL_PATH), e);
+        }
+        return true;
+    }
+
+    //protected for testing, should find better way so that this can be 
private!
+    protected void compareFiles(EvalFilePaths fpsA, EvalFilePaths fpsB) throws 
IOException {
+
+        List<Metadata> metadataListA =
+                extractReader.loadExtract(fpsA.getExtractFile(), 
alterExtractList);
+        List<Metadata> metadataListB =
+                extractReader.loadExtract(fpsB.getExtractFile(), 
alterExtractList);
+
+        //array indices for those metadata items handled in
+        //"that"
+        Set<Integer> handledB = new HashSet<>();
+        String containerID = Integer.toString(CONTAINER_ID.getAndIncrement());
+        //container table
+        Map<Cols, String> contData = new HashMap<>();
+        contData.put(Cols.CONTAINER_ID, containerID);
+        contData.put(Cols.FILE_PATH, 
fpsA.getRelativeSourceFilePath().toString());
+        long srcFileLength = getSourceFileLength(metadataListA, metadataListB);
+        contData.put(Cols.LENGTH,
+                srcFileLength > NON_EXISTENT_FILE_LENGTH ?
+                    Long.toString(srcFileLength) : "");
+        contData.put(Cols.FILE_EXTENSION,
+                
FilenameUtils.getExtension(fpsA.getRelativeSourceFilePath().getFileName().toString()));
+
+        long extractFileLengthA = getFileLength(fpsA.getExtractFile());
+        contData.put(Cols.EXTRACT_FILE_LENGTH_A, extractFileLengthA > 
NON_EXISTENT_FILE_LENGTH ?
+                Long.toString(extractFileLengthA) : "");
+
+        long extractFileLengthB = getFileLength(fpsA.getExtractFile());
+        contData.put(Cols.EXTRACT_FILE_LENGTH_B, extractFileLengthB > 
NON_EXISTENT_FILE_LENGTH ?
+                Long.toString(extractFileLengthB) : "");
+
+        writer.writeRow(COMPARISON_CONTAINERS, contData);
+
+        if (metadataListA == null) {
+            writeError(ERROR_TABLE_A, containerID, 
fpsA.getRelativeSourceFilePath().toString(),
+                    fpsA.getExtractFile());
+        }
+        if (metadataListB == null) {
+            writeError(ERROR_TABLE_B, containerID, 
fpsB.getRelativeSourceFilePath().toString(),
+                    fpsB.getExtractFile());
+        }
+
+        if (metadataListA == null && metadataListB == null) {
+            return;
+        }
+        List<Integer> numAttachmentsA = countAttachments(metadataListA);
+        List<Integer> numAttachmentsB = countAttachments(metadataListB);
+
+        //now get that metadata
+        if (metadataListA != null) {
+            for (int i = 0; i < metadataListA.size(); i++) {
+                String fileId = Integer.toString(ID.getAndIncrement());
+                Metadata metadataA = metadataListA.get(i);
+                Metadata metadataB = null;
+                //TODO: shouldn't be fileA!!!!
+                writeProfileData(fpsA, i, metadataA, fileId, containerID, 
numAttachmentsA, PROFILES_A);
+                writeExceptionData(fileId, metadataA, EXCEPTION_TABLE_A);
+                int matchIndex = getMatch(i, metadataListA, metadataListB);
+
+                if (matchIndex > -1) {
+                    metadataB = metadataListB.get(matchIndex);
+                    handledB.add(matchIndex);
+                }
+                if (metadataB != null) {
+                    writeProfileData(fpsB, i, metadataB, fileId, containerID, 
numAttachmentsB, PROFILES_B);
+                    writeExceptionData(fileId, metadataB, EXCEPTION_TABLE_B);
+                }
+                writeEmbeddedFilePathData(i, fileId, metadataA, metadataB);
+                //prep the token counting
+                tokenCounter.clear(FIELD_A);
+                tokenCounter.clear(FIELD_B);
+                //write content
+                try {
+                    writeContentData(fileId, metadataA, FIELD_A, 
CONTENTS_TABLE_A);
+                    writeContentData(fileId, metadataB, FIELD_B, 
CONTENTS_TABLE_B);
+                } catch (IOException e) {
+                    throw new RuntimeException(e);
+                }
+
+                //now run comparisons
+                if (tokenCounter.getTokenStatistics(FIELD_A).getTotalTokens() 
> 0
+                        && 
tokenCounter.getTokenStatistics(FIELD_B).getTotalTokens() > 0) {
+                    Map<Cols, String> data = new HashMap<>();
+                    data.put(Cols.ID, fileId);
+
+                    ContrastStatistics contrastStatistics =
+                            tokenContraster.calculateContrastStatistics(
+                            tokenCounter.getTokens(FIELD_A),
+                            tokenCounter.getTokenStatistics(FIELD_A),
+                            tokenCounter.getTokens(FIELD_B),
+                            tokenCounter.getTokenStatistics(FIELD_B));
+
+                    writeContrasts(data, contrastStatistics);
+                    writer.writeRow(CONTENT_COMPARISONS, data);
+                }
+            }
+        }
+        //now try to get any Metadata objects in "that"
+        //that haven't yet been handled.
+        if (metadataListB != null) {
+            for (int i = 0; i < metadataListB.size(); i++) {
+                if (handledB.contains(i)) {
+                    continue;
+                }
+                Metadata metadataB = metadataListB.get(i);
+                String fileId = Integer.toString(ID.getAndIncrement());
+                writeProfileData(fpsB, i, metadataB, fileId, containerID, 
numAttachmentsB, PROFILES_B);
+                writeEmbeddedFilePathData(i, fileId, null, metadataB);
+                writeExceptionData(fileId, metadataB, EXCEPTION_TABLE_B);
+
+                //prep the token counting
+                tokenCounter.clear(FIELD_B);
+                //write content
+                try {
+                    writeContentData(fileId, metadataB, FIELD_B, 
CONTENTS_TABLE_B);
+                } catch (IOException e) {
+                    throw new RuntimeException(e);
+                }
+            }
+        }
+    }
+
+    private void writeEmbeddedFilePathData(int i, String fileId, Metadata mA, 
Metadata mB) {
+        //container file, don't write anything
+        if (i == 0) {
+            return;
+        }
+        String pathA = null;
+        String pathB = null;
+        if (mA != null) {
+            pathA = mA.get(RecursiveParserWrapper.EMBEDDED_RESOURCE_PATH);
+        }
+        if (mB != null) {
+            pathB = mB.get(RecursiveParserWrapper.EMBEDDED_RESOURCE_PATH);
+        }
+        if (pathA != null) {
+            Map<Cols, String> d = new HashMap<>();
+            d.put(Cols.ID, fileId);
+            d.put(Cols.EMBEDDED_FILE_PATH, pathA);
+            try {
+                writer.writeRow(EMBEDDED_FILE_PATH_TABLE_A, d);
+            } catch (IOException e) {
+                throw new RuntimeException(e);
+            }
+        }
+        if (pathB != null &&
+                (pathA == null || ! pathA.equals(pathB))) {
+            Map<Cols, String> d = new HashMap<>();
+            d.put(Cols.ID, fileId);
+            d.put(Cols.EMBEDDED_FILE_PATH, pathB);
+            try {
+                writer.writeRow(EMBEDDED_FILE_PATH_TABLE_B, d);
+            } catch (IOException e) {
+                throw new RuntimeException(e);
+            }
+        }
+    }
+
+    private long getSourceFileLength(List<Metadata> metadataListA, 
List<Metadata> metadataListB) {
+        long len = getSourceFileLength(metadataListA);
+        if (len > NON_EXISTENT_FILE_LENGTH) {
+            return len;
+        }
+        return getSourceFileLength(metadataListB);
+    }
+
+
+    /**
+     * Try to find the matching metadata based on the 
RecursiveParserWrapper.EMBEDDED_RESOURCE_PATH
+     * If you can't find it, return -1;
+     *
+     * @param i                index for match in metadataListA
+     * @param metadataListA
+     * @param metadataListB
+     * @return
+     */
+    private int getMatch(int i,
+                         List<Metadata> metadataListA,
+                         List<Metadata> metadataListB) {
+        //TODO: could make this more robust
+        if (metadataListB == null || metadataListB.size() == 0) {
+            return -1;
+        }
+        if (i == 0) {
+            return 0;
+        }
+        if (metadataListA.size() == metadataListB.size()) {
+            //assume no rearrangments if lists are the same size
+            return i;
+        }
+
+        Metadata thisMetadata = metadataListA.get(i);
+        String embeddedPath = 
thisMetadata.get(RecursiveParserWrapper.EMBEDDED_RESOURCE_PATH);
+        if (embeddedPath == null) {
+            return -1;
+        }
+        if (i < metadataListB.size()) {
+        }
+
+        for (int j = 0; j < metadataListB.size(); j++) {
+            String thatEmbeddedPath = metadataListB.get(j).get(
+                    RecursiveParserWrapper.EMBEDDED_RESOURCE_PATH);
+            if (embeddedPath.equals(thatEmbeddedPath)) {
+                return j;
+            }
+        }
+        return -1;
+    }
+
+
+
+
+    private void writeContrasts(Map<Cols, String> data, ContrastStatistics 
contrastStatistics) {
+        writeContrastString(data, Cols.TOP_10_MORE_IN_A, 
contrastStatistics.getTopNMoreA());
+        writeContrastString(data, Cols.TOP_10_MORE_IN_B, 
contrastStatistics.getTopNMoreB());
+        writeContrastString(data, Cols.TOP_10_UNIQUE_TOKEN_DIFFS_A, 
contrastStatistics.getTopNUniqueA());
+        writeContrastString(data, Cols.TOP_10_UNIQUE_TOKEN_DIFFS_B, 
contrastStatistics.getTopNUniqueB());
+        data.put(Cols.OVERLAP, 
Double.toString(contrastStatistics.getOverlap()));
+        data.put(Cols.DICE_COEFFICIENT, 
Double.toString(contrastStatistics.getDiceCoefficient()));
+
+    }
+
+    private void writeContrastString(Map<Cols, String> data, Cols col, 
TokenIntPair[] tokenIntPairs) {
+
+        int i = 0;
+        StringBuilder sb = new StringBuilder();
+        for (TokenIntPair p : tokenIntPairs) {
+            if (i++ > 0) {
+                sb.append(" | ");
+            }
+            sb.append(p.getToken()).append(": ").append(p.getValue());
+        }
+        data.put(col, sb.toString());
+    }
+}
\ No newline at end of file

http://git-wip-us.apache.org/repos/asf/tika/blob/aa7a0c35/tika-eval/src/main/java/org/apache/tika/eval/ExtractProfiler.java
----------------------------------------------------------------------
diff --git a/tika-eval/src/main/java/org/apache/tika/eval/ExtractProfiler.java 
b/tika-eval/src/main/java/org/apache/tika/eval/ExtractProfiler.java
new file mode 100644
index 0000000..6840926
--- /dev/null
+++ b/tika-eval/src/main/java/org/apache/tika/eval/ExtractProfiler.java
@@ -0,0 +1,238 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.tika.eval;
+
+
+import java.io.IOException;
+import java.nio.file.Path;
+import java.sql.Types;
+import java.util.HashMap;
+import java.util.List;
+import java.util.Map;
+import java.util.concurrent.ArrayBlockingQueue;
+
+import org.apache.commons.cli.HelpFormatter;
+import org.apache.commons.cli.Option;
+import org.apache.commons.cli.Options;
+import org.apache.tika.batch.FileResource;
+import org.apache.tika.eval.db.ColInfo;
+import org.apache.tika.eval.db.Cols;
+import org.apache.tika.eval.db.TableInfo;
+import org.apache.tika.eval.io.ExtractReader;
+import org.apache.tika.eval.io.IDBWriter;
+import org.apache.tika.metadata.Metadata;
+import org.apache.tika.parser.RecursiveParserWrapper;
+
+public class ExtractProfiler extends AbstractProfiler {
+
+    static Options OPTIONS;
+    static {
+        Option db = new Option("db", true, "db file to which to write 
results");
+        db.setRequired(true);
+
+        //By the time this commandline is parsed, there should be both an 
extractDir and an inputDir
+        Option extractDir = new Option("extractDir", true, "directory for 
extract files");
+        extractDir.setRequired(true);
+
+        Option inputDir = new Option("inputDir", true,
+                "optional: directory for original binary input documents."+
+        " If not specified, -extractDir is crawled as is.");
+        inputDir.setRequired(true);
+
+        OPTIONS = new Options()
+                .addOption(db)
+                .addOption(extractDir)
+                .addOption(inputDir)
+                .addOption("bc", "optional: tika-batch config file")
+                .addOption("numConsumers", true, "optional: number of consumer 
threads")
+                .addOption(new Option("alterExtract", true,
+                        "for json-formatted extract files, " +
+                                "process full metadata list ('as_is'=default), 
" +
+                                "take just the first/container document 
('first_only'), " +
+                                "concatenate all content into the first 
metadata item ('concatenate_content')"));
+
+    }
+
+    public static void USAGE() {
+        HelpFormatter helpFormatter = new HelpFormatter();
+        helpFormatter.printHelp(
+                80,
+                "java -jar tika-eval-x.y.jar Profile -extractDir extracts -db 
mydb [-inputDir input]",
+                "Tool: Profile",
+                ExtractProfiler.OPTIONS,
+                "Note: for h2 db, do not include the .mv.db at the end of the 
db name.");
+    }
+
+    private final static String FIELD = "f";
+
+    public static TableInfo ERROR_TABLE = new TableInfo("errors",
+            new ColInfo(Cols.CONTAINER_ID, Types.INTEGER),
+            new ColInfo(Cols.FILE_PATH, Types.VARCHAR, FILE_PATH_MAX_LEN),
+            new ColInfo(Cols.EXTRACT_ERROR_TYPE_ID, Types.INTEGER),
+            new ColInfo(Cols.PARSE_ERROR_TYPE_ID, Types.INTEGER)
+    );
+
+    public static TableInfo EXCEPTION_TABLE = new TableInfo("parse_exceptions",
+            new ColInfo(Cols.ID, Types.INTEGER, "PRIMARY KEY"),
+            new ColInfo(Cols.ORIG_STACK_TRACE, Types.VARCHAR, 8192),
+            new ColInfo(Cols.SORT_STACK_TRACE, Types.VARCHAR, 8192),
+            new ColInfo(Cols.PARSE_EXCEPTION_TYPE_ID, Types.INTEGER)
+    );
+
+
+    public static TableInfo CONTAINER_TABLE = new TableInfo("containers",
+            new ColInfo(Cols.CONTAINER_ID, Types.INTEGER, "PRIMARY KEY"),
+            new ColInfo(Cols.FILE_PATH, Types.VARCHAR, FILE_PATH_MAX_LEN),
+            new ColInfo(Cols.LENGTH, Types.BIGINT),
+            new ColInfo(Cols.EXTRACT_FILE_LENGTH, Types.BIGINT)
+    );
+
+    public static TableInfo PROFILE_TABLE = new TableInfo("profiles",
+            new ColInfo(Cols.ID, Types.INTEGER, "PRIMARY KEY"),
+            new ColInfo(Cols.CONTAINER_ID, Types.INTEGER),
+            new ColInfo(Cols.FILE_NAME, Types.VARCHAR, 256),
+            new ColInfo(Cols.MD5, Types.CHAR, 32),
+            new ColInfo(Cols.LENGTH, Types.BIGINT),
+            new ColInfo(Cols.IS_EMBEDDED, Types.BOOLEAN),
+            new ColInfo(Cols.FILE_EXTENSION, Types.VARCHAR, 12),
+            new ColInfo(Cols.MIME_TYPE_ID, Types.INTEGER),
+            new ColInfo(Cols.ELAPSED_TIME_MILLIS, Types.INTEGER),
+            new ColInfo(Cols.NUM_ATTACHMENTS, Types.INTEGER),
+            new ColInfo(Cols.NUM_METADATA_VALUES, Types.INTEGER),
+            new ColInfo(Cols.NUM_PAGES, Types.INTEGER),
+            new ColInfo(Cols.HAS_CONTENT, Types.BOOLEAN)
+    );
+
+    public static TableInfo EMBEDDED_FILE_PATH_TABLE = new 
TableInfo("emb_file_names",
+            new ColInfo(Cols.ID, Types.INTEGER, "PRIMARY KEY"),
+            new ColInfo(Cols.EMBEDDED_FILE_PATH, Types.VARCHAR, 1024)
+    );
+
+    public static TableInfo CONTENTS_TABLE = new TableInfo("contents",
+            new ColInfo(Cols.ID, Types.INTEGER, "PRIMARY KEY"),
+            new ColInfo(Cols.CONTENT_LENGTH, Types.INTEGER),
+            new ColInfo(Cols.NUM_TOKENS, Types.INTEGER),
+            new ColInfo(Cols.NUM_UNIQUE_TOKENS, Types.INTEGER),
+            new ColInfo(Cols.COMMON_TOKENS_LANG, Types.VARCHAR, 12),
+            new ColInfo(Cols.NUM_COMMON_TOKENS, Types.INTEGER),
+            new ColInfo(Cols.TOP_N_TOKENS, Types.VARCHAR, 1024),
+            new ColInfo(Cols.NUM_ALPHABETIC_TOKENS, Types.INTEGER),
+            new ColInfo(Cols.LANG_ID_1, Types.VARCHAR, 12),
+            new ColInfo(Cols.LANG_ID_PROB_1, Types.FLOAT),
+            new ColInfo(Cols.LANG_ID_2, Types.VARCHAR, 12),
+            new ColInfo(Cols.LANG_ID_PROB_2, Types.FLOAT),
+            new ColInfo(Cols.UNICODE_CHAR_BLOCKS, Types.VARCHAR, 1024),
+            new ColInfo(Cols.TOKEN_ENTROPY_RATE, Types.FLOAT),
+            new ColInfo(Cols.TOKEN_LENGTH_SUM, Types.INTEGER),
+            new ColInfo(Cols.TOKEN_LENGTH_MEAN, Types.FLOAT),
+            new ColInfo(Cols.TOKEN_LENGTH_STD_DEV, Types.FLOAT)
+    );
+
+    private final Path inputDir;
+    private final Path extractDir;
+    private final ExtractReader.ALTER_METADATA_LIST alterExtractList;
+    private final ExtractReader extractReader = new ExtractReader();
+
+    public ExtractProfiler(ArrayBlockingQueue<FileResource> queue,
+                           Path inputDir, Path extractDir,
+                           IDBWriter dbWriter, 
ExtractReader.ALTER_METADATA_LIST alterExtractList) {
+        super(queue, dbWriter);
+        this.inputDir = inputDir;
+        this.extractDir = extractDir;
+        this.alterExtractList = alterExtractList;
+    }
+
+    @Override
+    public boolean processFileResource(FileResource fileResource) {
+        Metadata metadata = fileResource.getMetadata();
+        EvalFilePaths fps = null;
+
+        if (inputDir != null && inputDir.equals(extractDir)) {
+            //crawling an extract dir
+            fps = getPathsFromExtractCrawl(metadata, extractDir);
+        } else {
+            fps = getPathsFromSrcCrawl(metadata, inputDir, extractDir);
+        }
+        List<Metadata> metadataList = 
extractReader.loadExtract(fps.getExtractFile(), alterExtractList);
+
+        Map<Cols, String> contOutput = new HashMap<>();
+        String containerId = Integer.toString(CONTAINER_ID.incrementAndGet());
+        Long srcFileLen = getSourceFileLength(fps, metadataList);
+        contOutput.put(Cols.LENGTH,
+                srcFileLen > NON_EXISTENT_FILE_LENGTH ?
+                        Long.toString(srcFileLen): "");
+        contOutput.put(Cols.CONTAINER_ID, containerId);
+        contOutput.put(Cols.FILE_PATH, 
fps.getRelativeSourceFilePath().toString());
+
+        if (fps.getExtractFileLength() > 0) {
+            contOutput.put(Cols.EXTRACT_FILE_LENGTH,
+                    (fps.getExtractFile() == null) ?
+                            "" :
+                    Long.toString(fps.getExtractFileLength()));
+        }
+        try {
+            writer.writeRow(CONTAINER_TABLE, contOutput);
+        } catch (IOException e) {
+            throw new RuntimeException(e);
+        }
+
+
+        if (metadataList == null) {
+            try {
+                writeError(ERROR_TABLE, containerId,
+                        fps.getRelativeSourceFilePath().toString(), 
fps.getExtractFile());
+            } catch (IOException e) {
+                throw new RuntimeException(e);
+            }
+            return true;
+        }
+
+        //TODO: calculate num_attachments, add to profile table
+
+        List<Integer> numAttachments = countAttachments(metadataList);
+        int i = 0;
+        for (Metadata m : metadataList) {
+            String fileId = Integer.toString(ID.incrementAndGet());
+            writeProfileData(fps, i, m, fileId, containerId, numAttachments, 
PROFILE_TABLE);
+            writeEmbeddedPathData(i, fileId, m, EMBEDDED_FILE_PATH_TABLE);
+            writeExceptionData(fileId, m, EXCEPTION_TABLE);
+            try {
+                writeContentData(fileId, m, FIELD, CONTENTS_TABLE);
+            } catch (IOException e) {
+                throw new RuntimeException(e);
+            }
+            i++;
+        }
+        return true;
+    }
+
+    private void writeEmbeddedPathData(int i, String fileId, Metadata m,
+                                       TableInfo embeddedFilePathTable) {
+        if (i == 0) {
+            return;
+        }
+        Map<Cols, String> data = new HashMap<>();
+        data.put(Cols.ID, fileId);
+        data.put(Cols.EMBEDDED_FILE_PATH,
+                m.get(RecursiveParserWrapper.EMBEDDED_RESOURCE_PATH));
+        try {
+            writer.writeRow(embeddedFilePathTable, data);
+        } catch (IOException e) {
+            throw new RuntimeException(e);
+        }
+    }
+}

Reply via email to