http://git-wip-us.apache.org/repos/asf/tika/blob/aa7a0c35/tika-eval/src/main/java/org/apache/tika/eval/TikaEvalCLI.java
----------------------------------------------------------------------
diff --git a/tika-eval/src/main/java/org/apache/tika/eval/TikaEvalCLI.java 
b/tika-eval/src/main/java/org/apache/tika/eval/TikaEvalCLI.java
new file mode 100644
index 0000000..5860327
--- /dev/null
+++ b/tika-eval/src/main/java/org/apache/tika/eval/TikaEvalCLI.java
@@ -0,0 +1,262 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.tika.eval;
+
+import java.nio.file.Files;
+import java.nio.file.Path;
+import java.nio.file.StandardCopyOption;
+import java.sql.SQLException;
+import java.util.ArrayList;
+import java.util.Arrays;
+import java.util.List;
+
+import org.apache.commons.cli.DefaultParser;
+import org.apache.commons.cli.ParseException;
+import org.apache.tika.batch.fs.FSBatchProcessCLI;
+import org.apache.tika.eval.reports.ResultsReporter;
+import org.h2.tools.Console;
+
+public class TikaEvalCLI {
+    static final String[] tools = {"Profile", "Compare", "Report", "StartDB"};
+
+    private static String specifyTools() {
+        StringBuilder sb = new StringBuilder();
+        sb.append("Must specify one of the following tools in the first 
parameter:\n");
+        for (String s : tools) {
+            sb.append(s+"\n");
+        }
+        return sb.toString();
+
+    }
+
+    private void execute(String[] args) throws Exception {
+        String tool = args[0];
+        String[] subsetArgs = new String[args.length-1];
+        System.arraycopy(args, 1, subsetArgs, 0, args.length - 1);
+        if (tool.equals("Report")) {
+            handleReport(subsetArgs);
+        } else if (tool.equals("Compare")) {
+            handleCompare(subsetArgs);
+        } else if (tool.equals("Profile")) {
+            handleProfile(subsetArgs);
+        } else if (tool.equals("StartDB")) {
+            handleStartDB(subsetArgs);
+        } else {
+            System.out.println(specifyTools());
+        }
+    }
+
+    private void handleStartDB(String[] args) throws SQLException {
+        List<String> argList = new ArrayList<>();
+        argList.add("-web");
+        Console.main(argList.toArray(new String[argList.size()]));
+        while(true) {
+            try {
+                Thread.sleep(1000);
+            } catch (InterruptedException e){
+                break;
+            }
+        }
+    }
+
+    private void handleProfile(String[] subsetArgs) throws Exception {
+        List<String> argList = new ArrayList(Arrays.asList(subsetArgs));
+
+        boolean containsBC = false;
+        String inputDir = null;
+        String extractDir = null;
+        String alterExtract = null;
+        //confirm there's a batch-config file
+        for (int i = 0; i < argList.size(); i++) {
+            String arg = argList.get(i);
+            if (arg.equals("-bc")) {
+                containsBC = true;
+            } else if (arg.equals("-inputDir")) {
+                if (i+1 >= argList.size()) {
+                    System.err.println("Must specify directory after 
-inputDir");
+                    ExtractProfiler.USAGE();
+                    return;
+                }
+                inputDir = argList.get(i+1);
+                i++;
+            } else if (arg.equals("-extractDir")) {
+                if (i+1 >= argList.size()) {
+                    System.err.println("Must specify directory after 
-extractDir");
+                    ExtractProfiler.USAGE();
+                    return;
+                }
+                extractDir = argList.get(i+1);
+                i++;
+            } else if (arg.equals("-alterExtract")) {
+                if (i+1 >= argList.size()) {
+                    System.err.println("Must specify directory after 
-extractsB");
+                    ExtractComparer.USAGE();
+                    return;
+                }
+                alterExtract = argList.get(i+1);
+                i++;
+            }
+        }
+
+        if (alterExtract != null && !alterExtract.equals("as_is") &&
+                !alterExtract.equals("concatenate_content") &&
+                !alterExtract.equals("first_only")) {
+            System.out.println("Sorry, I don't understand:"+alterExtract+
+                    ". The values must be one of: as_is, first_only, 
concatenate_content");
+            ExtractProfiler.USAGE();
+            return;
+        }
+
+        //need to specify each in this commandline
+        //if only extractDir is passed to tika-batch,
+        //the crawler will see no inputDir and start crawling "input".
+        //this allows the user to specify either extractDir or inputDir
+        if (extractDir == null && inputDir != null) {
+            argList.add("-extractDir");
+            argList.add(inputDir);
+        } else if (inputDir == null && extractDir != null) {
+            argList.add("-inputDir");
+            argList.add(extractDir);
+        }
+
+        Path tmpBCConfig = null;
+        try {
+            tmpBCConfig = Files.createTempFile("tika-eval-profiler", ".xml");
+            if (! containsBC) {
+                Files.copy(
+                        
this.getClass().getResourceAsStream("/tika-eval-profiler-config.xml"),
+                        tmpBCConfig, StandardCopyOption.REPLACE_EXISTING);
+                argList.add("-bc");
+                argList.add(tmpBCConfig.toAbsolutePath().toString());
+            }
+
+            String[] updatedArgs = argList.toArray(new String[argList.size()]);
+            DefaultParser defaultCLIParser = new DefaultParser();
+            try {
+                defaultCLIParser.parse(ExtractProfiler.OPTIONS, updatedArgs);
+            } catch (ParseException e) {
+                System.out.println(e.getMessage()+"\n");
+                ExtractProfiler.USAGE();
+                return;
+            }
+
+            FSBatchProcessCLI.main(updatedArgs);
+        } finally {
+            if (tmpBCConfig != null && Files.isRegularFile(tmpBCConfig)) {
+                Files.delete(tmpBCConfig);
+            }
+        }
+    }
+
+    private void handleCompare(String[] subsetArgs) throws Exception{
+        List<String> argList = new ArrayList(Arrays.asList(subsetArgs));
+
+        boolean containsBC = false;
+        String inputDir = null;
+        String extractsA = null;
+        String alterExtract = null;
+        //confirm there's a batch-config file
+        for (int i = 0; i < argList.size(); i++) {
+            String arg = argList.get(i);
+            if (arg.equals("-bc")) {
+                containsBC = true;
+            } else if (arg.equals("-inputDir")) {
+                if (i+1 >= argList.size()) {
+                    System.err.println("Must specify directory after 
-inputDir");
+                    ExtractComparer.USAGE();
+                    return;
+                }
+                inputDir = argList.get(i+1);
+                i++;
+            } else if (arg.equals("-extractsA")) {
+                if (i+1 >= argList.size()) {
+                    System.err.println("Must specify directory after 
-extractsA");
+                    ExtractComparer.USAGE();
+                    return;
+                }
+                extractsA = argList.get(i+1);
+                i++;
+            } else if (arg.equals("-alterExtract")) {
+                if (i+1 >= argList.size()) {
+                    System.err.println("Must specify directory after 
-extractsB");
+                    ExtractComparer.USAGE();
+                    return;
+                }
+                alterExtract = argList.get(i+1);
+                i++;
+            }
+        }
+        if (alterExtract != null && !alterExtract.equals("as_is") &&
+                !alterExtract.equals("concatenate_content") &&
+                !alterExtract.equals("first_only")) {
+            System.out.println("Sorry, I don't understand:"+alterExtract+
+            ". The values must be one of: as_is, first_only, 
concatenate_content");
+            ExtractComparer.USAGE();
+            return;
+        }
+
+        //need to specify each in the commandline that goes into tika-batch
+        //if only extractDir is passed to tika-batch,
+        //the crawler will see no inputDir and start crawling "input".
+        //if the user doesn't specify inputDir, crawl extractsA
+        if (inputDir == null && extractsA != null) {
+            argList.add("-inputDir");
+            argList.add(extractsA);
+        }
+
+        Path tmpBCConfig = null;
+        try {
+            tmpBCConfig = Files.createTempFile("tika-eval", ".xml");
+            if (! containsBC) {
+                Files.copy(
+                        
this.getClass().getResourceAsStream("/tika-eval-comparison-config.xml"),
+                        tmpBCConfig, StandardCopyOption.REPLACE_EXISTING);
+                argList.add("-bc");
+                argList.add(tmpBCConfig.toAbsolutePath().toString());
+
+            }
+            String[] updatedArgs = argList.toArray(new String[argList.size()]);
+            DefaultParser defaultCLIParser = new DefaultParser();
+            try {
+                defaultCLIParser.parse(ExtractComparer.OPTIONS, updatedArgs);
+            } catch (ParseException e) {
+                System.out.println(e.getMessage()+"\n");
+                ExtractComparer.USAGE();
+                return;
+            }
+
+            FSBatchProcessCLI.main(updatedArgs);
+        } finally {
+            if (tmpBCConfig != null && Files.isRegularFile(tmpBCConfig)) {
+                Files.delete(tmpBCConfig);
+            }
+        }
+    }
+
+    private void handleReport(String[] subsetArgs) throws Exception {
+        ResultsReporter.main(subsetArgs);
+    }
+
+    public static void main(String[] args) throws Exception {
+        TikaEvalCLI cli = new TikaEvalCLI();
+        if (args.length == 0) {
+            System.err.println(specifyTools());
+            return;
+        }
+        cli.execute(args);
+    }
+}

http://git-wip-us.apache.org/repos/asf/tika/blob/aa7a0c35/tika-eval/src/main/java/org/apache/tika/eval/XMLErrorLogUpdater.java
----------------------------------------------------------------------
diff --git 
a/tika-eval/src/main/java/org/apache/tika/eval/XMLErrorLogUpdater.java 
b/tika-eval/src/main/java/org/apache/tika/eval/XMLErrorLogUpdater.java
new file mode 100644
index 0000000..9a7e7aa
--- /dev/null
+++ b/tika-eval/src/main/java/org/apache/tika/eval/XMLErrorLogUpdater.java
@@ -0,0 +1,226 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.tika.eval;
+
+
+import javax.xml.stream.XMLInputFactory;
+import javax.xml.stream.XMLStreamConstants;
+import javax.xml.stream.XMLStreamException;
+import javax.xml.stream.XMLStreamReader;
+import java.io.IOException;
+import java.io.InputStream;
+import java.io.StringReader;
+import java.nio.file.Files;
+import java.nio.file.Path;
+import java.nio.file.Paths;
+import java.sql.Connection;
+import java.sql.ResultSet;
+import java.sql.SQLException;
+import java.sql.Statement;
+
+import org.apache.log4j.Level;
+import org.apache.tika.eval.db.Cols;
+import org.apache.tika.eval.db.DBUtil;
+import org.apache.tika.eval.db.H2Util;
+import org.apache.tika.eval.db.TableInfo;
+import org.apache.tika.eval.io.XMLLogMsgHandler;
+import org.apache.tika.eval.io.XMLLogReader;
+import org.apache.tika.eval.reports.ResultsReporter;
+import org.apache.tika.io.IOExceptionWithCause;
+import org.slf4j.Logger;
+import org.slf4j.LoggerFactory;
+
+/**
+ * This is a very task specific class that reads a log file and updates
+ * the "comparisons" table.  It should not be run in a multithreaded 
environment.
+ */
+public class XMLErrorLogUpdater {
+
+    protected static Logger LOGGER = 
LoggerFactory.getLogger(ResultsReporter.class);
+
+    private Statement statement;
+
+    public static void main(String[] args) throws Exception {
+
+        XMLErrorLogUpdater writer = new XMLErrorLogUpdater();
+        Path xmlLogFileA = Paths.get(args[0]);
+        Path xmlLogFileB = Paths.get(args[1]);
+        Path db = Paths.get(args[2]);
+        DBUtil dbUtil = new H2Util(db);
+        Connection connection = dbUtil.getConnection(true);
+        writer.update(connection, ExtractComparer.ERROR_TABLE_A, xmlLogFileA);
+        writer.update(connection, ExtractComparer.ERROR_TABLE_B, xmlLogFileB);
+        connection.commit();
+        connection.close();
+    }
+
+    public void update(Connection connection, TableInfo tableInfo, Path 
xmlLogFile) throws Exception {
+        statement = connection.createStatement();
+        XMLLogReader reader = new XMLLogReader();
+        try (InputStream is = Files.newInputStream(xmlLogFile)) {
+            reader.read(is, new ErrorMsgUpdater(tableInfo.getName()));
+        } catch (IOException e) {
+            throw new RuntimeException("Problem reading: 
"+xmlLogFile.toAbsolutePath().toString());
+        } finally {
+            try {
+                connection.commit();
+                statement.close();
+            } catch (SQLException e) {
+                throw new RuntimeException("Failed to close db connection!", 
e);
+            }
+        }
+    }
+
+    private class ErrorMsgUpdater implements XMLLogMsgHandler {
+        private final String errorTablename;
+
+        private ErrorMsgUpdater(String errorTablename) {
+            this.errorTablename = errorTablename;
+        }
+
+        @Override
+        public void handleMsg(Level level, String xml) throws SQLException, 
IOException {
+            if (! level.equals(Level.ERROR)) {
+                return;
+            }
+            XMLStreamReader reader = null;
+            try {
+                reader = 
XMLInputFactory.newInstance().createXMLStreamReader(new StringReader(xml));
+            } catch (XMLStreamException e) {
+                throw new IOExceptionWithCause(e);
+            }
+            String type = null;
+            String resourceId = null;
+            try {
+                while (reader.hasNext() && type == null && resourceId == null) 
{
+                    reader.next();
+                    switch (reader.getEventType()) {
+                        case XMLStreamConstants.START_ELEMENT:
+                            if ("timed_out".equals(reader.getLocalName())) {
+                                resourceId = reader.getAttributeValue("", 
"resourceId");
+                                update(errorTablename, resourceId,
+                                        
AbstractProfiler.PARSE_ERROR_TYPE.TIMEOUT);
+
+                            } else if ("oom".equals(reader.getLocalName())) {
+                                resourceId = reader.getAttributeValue("", 
"resourceId");
+                                update(errorTablename, resourceId, 
AbstractProfiler.PARSE_ERROR_TYPE.OOM);
+                            }
+                            break;
+                    }
+                }
+                reader.close();
+            } catch (XMLStreamException e) {
+                throw new IOExceptionWithCause(e);
+            }
+        }
+
+        private void update(String errorTableName,
+                            String filePath, AbstractProfiler.PARSE_ERROR_TYPE 
type) throws SQLException {
+            int containerId = getContainerId(filePath);
+            String sql = "SELECT count(1) from "+errorTableName +
+                    " where "+Cols.CONTAINER_ID +
+                    " = "+containerId + " or "+
+                    Cols.FILE_PATH + "='"+filePath+"'";
+            ResultSet rs = statement.executeQuery(sql);
+
+            //now try to figure out if that file already exists
+            //in parse errors
+            int hitCount = 0;
+            while (rs.next()) {
+                hitCount = rs.getInt(1);
+            }
+
+            //if it does, update all records matching that path or container id
+            if (hitCount > 0) {
+                sql = "UPDATE " + errorTableName +
+                        " SET " + Cols.PARSE_ERROR_TYPE_ID +
+                        " = " + type.ordinal() + ","+
+                        Cols.FILE_PATH + "='" +filePath+"'"+
+                        " where "+Cols.CONTAINER_ID +
+                        "="+containerId + " or "+
+                        Cols.FILE_PATH + "='"+filePath+"'";;
+
+            } else {
+                //if not and container id > -1
+                //insert full record
+                if (containerId > -1) {
+                    sql = "INSERT INTO " + errorTableName +
+                            " ("+Cols.CONTAINER_ID+","+Cols.FILE_PATH 
+","+Cols.PARSE_ERROR_TYPE_ID+")"+
+                            " values (" + containerId + ", '" + filePath + 
"'," +
+                            type.ordinal() + ");";
+                } else {
+                    //if container id == -1, insert only file path and parse 
error type id
+                    sql = "INSERT INTO " + errorTableName +
+                            " 
("+Cols.FILE_PATH.name()+","+Cols.PARSE_ERROR_TYPE_ID+")"+
+                            "values ('" + filePath + "'," +
+                            type.ordinal() + ");";
+                }
+
+            }
+            int updated = statement.executeUpdate(sql);
+            if (updated == 0) {
+                //TODO: log
+                LOGGER.warn("made no updates in xmlerrorlogupdater!");
+            } else if (updated > 1) {
+                LOGGER.warn("made too many updates");
+            }
+        }
+
+        private int getContainerId(String resourceId) throws SQLException {
+            int containerId = -1;
+            String sql = "SELECT " + Cols.CONTAINER_ID.name() +
+                    " from " + ExtractProfiler.CONTAINER_TABLE.getName()+
+                    " where " + Cols.FILE_PATH +
+                    " ='"+resourceId+"'";
+            ResultSet rs = statement.executeQuery(sql);
+            int resultCount = 0;
+            while (rs.next()) {
+                containerId = rs.getInt(1);
+                resultCount++;
+            }
+            rs.close();
+
+            if (resultCount == 0) {
+                LOGGER.warn("Should have found a container for: "+resourceId);
+            } else if (resultCount > 1) {
+                LOGGER.error("Records ids should be unique:"+resourceId);
+            }
+/*
+            if (containerId < 0) {
+                System.err.println("CONTAINER ID < 0!!!");
+                sql = "SELECT MAX("+ Cols.CONTAINER_ID.name() +
+                        ") from "+ExtractProfiler.CONTAINER_TABLE.getName();
+                rs = statement.executeQuery(sql);
+                while (rs.next()) {
+                    containerId = rs.getInt(1);
+                }
+                rs.close();
+                if (containerId < 0) {
+                    //log and abort
+                    //return -1?
+                } else {
+                    containerId++;
+                }
+
+            }*/
+            return containerId;
+        }
+
+
+    }
+
+}

http://git-wip-us.apache.org/repos/asf/tika/blob/aa7a0c35/tika-eval/src/main/java/org/apache/tika/eval/batch/DBConsumersManager.java
----------------------------------------------------------------------
diff --git 
a/tika-eval/src/main/java/org/apache/tika/eval/batch/DBConsumersManager.java 
b/tika-eval/src/main/java/org/apache/tika/eval/batch/DBConsumersManager.java
new file mode 100644
index 0000000..2c655cc
--- /dev/null
+++ b/tika-eval/src/main/java/org/apache/tika/eval/batch/DBConsumersManager.java
@@ -0,0 +1,92 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.tika.eval.batch;
+
+import java.io.IOException;
+import java.nio.file.Path;
+import java.sql.Connection;
+import java.sql.SQLException;
+import java.util.ArrayList;
+import java.util.List;
+
+import org.apache.tika.batch.ConsumersManager;
+import org.apache.tika.batch.FileResourceConsumer;
+import org.apache.tika.eval.AbstractProfiler;
+import org.apache.tika.eval.XMLErrorLogUpdater;
+import org.apache.tika.eval.db.DBUtil;
+import org.apache.tika.eval.db.TableInfo;
+
+
+public class DBConsumersManager extends ConsumersManager {
+
+    private Connection conn;
+    List<LogTablePair> errorLogs = new ArrayList<>();
+
+    public DBConsumersManager(DBUtil dbUtil, List<FileResourceConsumer> 
consumers)
+            throws IOException {
+        super(consumers);
+        this.conn = dbUtil.getConnection(true);
+    }
+
+
+    @Override
+    public void shutdown() {
+
+        for (FileResourceConsumer consumer : getConsumers()) {
+            if (consumer instanceof AbstractProfiler) {
+                try{
+                    ((AbstractProfiler)consumer).closeWriter();
+                } catch (IOException e) {
+                    throw new RuntimeException(e);
+                }
+            }
+        }
+        //MUST HAPPEN AFTER consumers have closed and
+        //committed container information!!!
+        XMLErrorLogUpdater up = new XMLErrorLogUpdater();
+        for (LogTablePair p : errorLogs) {
+            try {
+                up.update(conn, p.tableInfo, p.log);
+            } catch (Exception e) {
+                throw new RuntimeException(e);
+            }
+        }
+
+        try {
+            conn.commit();
+        } catch (SQLException e) {
+            throw new RuntimeException(e);
+        }
+        try {
+            conn.close();
+        } catch (SQLException e) {
+            throw new RuntimeException(e);
+        }
+    }
+
+    public void addErrorLogTablePair(Path log, TableInfo tableName) {
+        LogTablePair p = new LogTablePair();
+        p.log = log;
+        p.tableInfo = tableName;
+        errorLogs.add(p);
+    }
+
+    class LogTablePair {
+        Path log;
+        TableInfo tableInfo;
+    }
+}

http://git-wip-us.apache.org/repos/asf/tika/blob/aa7a0c35/tika-eval/src/main/java/org/apache/tika/eval/batch/EvalConsumerBuilder.java
----------------------------------------------------------------------
diff --git 
a/tika-eval/src/main/java/org/apache/tika/eval/batch/EvalConsumerBuilder.java 
b/tika-eval/src/main/java/org/apache/tika/eval/batch/EvalConsumerBuilder.java
new file mode 100644
index 0000000..8135887
--- /dev/null
+++ 
b/tika-eval/src/main/java/org/apache/tika/eval/batch/EvalConsumerBuilder.java
@@ -0,0 +1,134 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.tika.eval.batch;
+
+import java.io.IOException;
+import java.sql.SQLException;
+import java.util.HashMap;
+import java.util.List;
+import java.util.Map;
+import java.util.concurrent.ArrayBlockingQueue;
+import java.util.concurrent.atomic.AtomicInteger;
+
+import org.apache.tika.batch.FileResource;
+import org.apache.tika.batch.FileResourceConsumer;
+import org.apache.tika.eval.AbstractProfiler;
+import org.apache.tika.eval.db.Cols;
+import org.apache.tika.eval.db.DBUtil;
+import org.apache.tika.eval.db.TableInfo;
+import org.apache.tika.eval.io.ExtractReader;
+import org.apache.tika.eval.io.IDBWriter;
+
+public abstract class EvalConsumerBuilder {
+    private AtomicInteger count = new AtomicInteger(0);
+    protected ArrayBlockingQueue<FileResource> queue;
+    Map<String, String> localAttrs;
+    DBUtil dbUtil;
+
+    public void init(ArrayBlockingQueue<FileResource> queue, Map<String, 
String> localAttrs,
+                     DBUtil dbUtil) {
+        this.queue = queue;
+        this.localAttrs = localAttrs;
+        this.dbUtil = dbUtil;
+    }
+
+    public abstract FileResourceConsumer build() throws IOException, 
SQLException;
+
+    protected abstract List<TableInfo> getTableInfo();
+
+    protected abstract IDBWriter getDBWriter() throws IOException, 
SQLException;
+
+    protected abstract void addErrorLogTablePairs(DBConsumersManager manager);
+
+    public void populateRefTables(IDBWriter writer) throws IOException, 
SQLException {
+        //figure out cleaner way of doing this!
+        if (count.getAndIncrement() > 0) {
+            return;
+        }
+        Map<Cols, String> m = new HashMap<>();
+        for (AbstractProfiler.PARSE_ERROR_TYPE t : 
AbstractProfiler.PARSE_ERROR_TYPE.values()) {
+            m.clear();
+            m.put(Cols.PARSE_ERROR_TYPE_ID, Integer.toString(t.ordinal()));
+            m.put(Cols.PARSE_ERROR_DESCRIPTION, t.name());
+            writer.writeRow(AbstractProfiler.REF_PARSE_ERROR_TYPES, m);
+        }
+
+        for (AbstractProfiler.EXCEPTION_TYPE t : 
AbstractProfiler.EXCEPTION_TYPE.values()) {
+            m.clear();
+            m.put(Cols.PARSE_EXCEPTION_TYPE_ID, Integer.toString(t.ordinal()));
+            m.put(Cols.PARSE_EXCEPTION_DESCRIPTION, t.name());
+            writer.writeRow(AbstractProfiler.REF_PARSE_EXCEPTION_TYPES, m);
+        }
+
+        for (AbstractProfiler.EXTRACT_ERROR_TYPE t :
+                AbstractProfiler.EXTRACT_ERROR_TYPE.values()) {
+            m.clear();
+            m.put(Cols.EXTRACT_ERROR_TYPE_ID, Integer.toString(t.ordinal()));
+            m.put(Cols.EXTRACT_ERROR_DESCRIPTION, t.name());
+            writer.writeRow(AbstractProfiler.REF_EXTRACT_ERROR_TYPES, m);
+        }
+
+    }
+
+    ExtractReader.ALTER_METADATA_LIST getAlterMetadata(Map<String, String> 
localAttrs) {
+
+        String alterExtractString = localAttrs.get("alterExtract");
+        ExtractReader.ALTER_METADATA_LIST alterExtractList = 
ExtractReader.ALTER_METADATA_LIST.AS_IS;
+        if (alterExtractString == null || 
alterExtractString.equalsIgnoreCase("as_is")) {
+            alterExtractList = ExtractReader.ALTER_METADATA_LIST.AS_IS;
+        } else if (alterExtractString.equalsIgnoreCase("first_only")) {
+            alterExtractList = ExtractReader.ALTER_METADATA_LIST.FIRST_ONLY;
+        } else if (alterExtractString.equalsIgnoreCase("concatenate_content")) 
{
+            alterExtractList = 
ExtractReader.ALTER_METADATA_LIST.CONCATENATE_CONTENT_INTO_FIRST;
+        } else {
+            throw new RuntimeException("options for alterExtract: as_is, 
first_only, concatenate_content." +
+                    " I don't understand:" + alterExtractString);
+        }
+        return alterExtractList;
+    }
+
+
+/*
+    public abstract Map<String, String> getIndexInfo();
+
+    class ValueComparator implements Comparator<String> {
+
+        Map<String, ColInfo> map;
+
+        public ValueComparator(Map<String, ColInfo> base) {
+            this.map = base;
+        }
+
+        public int compare(String a, String b) {
+            Integer aVal = map.get(a).getDBColOffset();
+            Integer bVal = map.get(b).getDBColOffset();
+            if (aVal == null || bVal == null) {
+                throw new IllegalArgumentException("Column offset must be 
specified!");
+            }
+            if (aVal == bVal && ! map.get(a).equals(map.get(b))) {
+                throw new IllegalArgumentException("Column offsets must be 
unique: " + a + " and " + b + " both have: "+aVal);
+            }
+            if (aVal < bVal) {
+                return -1;
+            } else {
+                return 1;
+            }
+        }
+    }
+*/
+}

http://git-wip-us.apache.org/repos/asf/tika/blob/aa7a0c35/tika-eval/src/main/java/org/apache/tika/eval/batch/EvalConsumersBuilder.java
----------------------------------------------------------------------
diff --git 
a/tika-eval/src/main/java/org/apache/tika/eval/batch/EvalConsumersBuilder.java 
b/tika-eval/src/main/java/org/apache/tika/eval/batch/EvalConsumersBuilder.java
new file mode 100644
index 0000000..00f4ad7
--- /dev/null
+++ 
b/tika-eval/src/main/java/org/apache/tika/eval/batch/EvalConsumersBuilder.java
@@ -0,0 +1,133 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.tika.eval.batch;
+
+
+import java.io.IOException;
+import java.nio.file.Path;
+import java.nio.file.Paths;
+import java.sql.SQLException;
+import java.util.LinkedList;
+import java.util.List;
+import java.util.Map;
+import java.util.concurrent.ArrayBlockingQueue;
+
+import org.apache.tika.batch.ConsumersManager;
+import org.apache.tika.batch.FileResource;
+import org.apache.tika.batch.FileResourceConsumer;
+import org.apache.tika.batch.builders.AbstractConsumersBuilder;
+import org.apache.tika.batch.builders.BatchProcessBuilder;
+import org.apache.tika.eval.AbstractProfiler;
+import org.apache.tika.eval.db.DBUtil;
+import org.apache.tika.eval.db.H2Util;
+import org.apache.tika.eval.util.LanguageIDWrapper;
+import org.apache.tika.util.ClassLoaderUtil;
+import org.apache.tika.util.PropsUtil;
+import org.apache.tika.util.XMLDOMUtil;
+import org.w3c.dom.Node;
+
+public class EvalConsumersBuilder extends AbstractConsumersBuilder {
+
+    @Override
+    public ConsumersManager build(Node node, Map<String, String> 
runtimeAttributes,
+                                  ArrayBlockingQueue<FileResource> queue) {
+
+        List<FileResourceConsumer> consumers = new LinkedList<>();
+        int numConsumers = 
BatchProcessBuilder.getNumConsumers(runtimeAttributes);
+
+        Map<String, String> localAttrs = XMLDOMUtil.mapifyAttrs(node, 
runtimeAttributes);
+
+
+        Path db = getPath(localAttrs, "db");
+        Path langModelDir = getPath(localAttrs, "langModelDir");
+
+        try {
+            if (langModelDir == null) {
+                LanguageIDWrapper.loadBuiltInModels();
+            } else {
+                LanguageIDWrapper.loadModels(langModelDir);
+            }
+        } catch (IOException e) {
+            throw new RuntimeException(e);
+        }
+
+        Path commonTokens = getNonNullPath(localAttrs, "commonTokens");
+        try {
+            AbstractProfiler.loadCommonTokens(commonTokens);
+        } catch (IOException e) {
+            throw new RuntimeException(e);
+        }
+
+        boolean append = PropsUtil.getBoolean(localAttrs.get("dbAppend"), 
false);
+
+        if (db == null) {
+            throw new RuntimeException("Must specify: -db");
+        }
+        //parameterize which db util to use
+        DBUtil util = new H2Util(db);
+        EvalConsumerBuilder consumerBuilder = 
ClassLoaderUtil.buildClass(EvalConsumerBuilder.class,
+                PropsUtil.getString(localAttrs.get("consumerBuilderClass"), 
null));
+        if (consumerBuilder == null) {
+            throw new RuntimeException("Must specify consumerBuilderClass in 
config file");
+        }
+        consumerBuilder.init(queue, localAttrs, util);
+
+        try {
+            util.createDB(consumerBuilder.getTableInfo(), append);
+        } catch (SQLException e) {
+            throw new RuntimeException(e);
+        } catch (IOException e) {
+            throw new RuntimeException(e);
+        }
+        for (int i = 0; i < numConsumers; i++) {
+            try {
+                consumers.add(consumerBuilder.build());
+            } catch (IOException | SQLException e) {
+                throw new RuntimeException(e);
+            }
+        }
+
+        DBConsumersManager manager;
+        try {
+            manager = new DBConsumersManager(util, consumers);
+        } catch (IOException e) {
+            throw new RuntimeException(e);
+        }
+        consumerBuilder.addErrorLogTablePairs(manager);
+
+        return manager;
+    }
+
+    private Path getNonNullPath(Map<String, String> attrs, String key) {
+        Path p = getPath(attrs, key);
+        if (p == null) {
+            throw new RuntimeException("Must specify a file for this 
attribute: "+key);
+        }
+        return p;
+    }
+
+
+    protected Path getPath(Map<String, String> attrs, String key) {
+        String filePath = attrs.get(key);
+        if (filePath == null) {
+            return null;
+        }
+        return Paths.get(filePath);
+    }
+
+
+}

http://git-wip-us.apache.org/repos/asf/tika/blob/aa7a0c35/tika-eval/src/main/java/org/apache/tika/eval/batch/FileComparerBuilder.java
----------------------------------------------------------------------
diff --git 
a/tika-eval/src/main/java/org/apache/tika/eval/batch/FileComparerBuilder.java 
b/tika-eval/src/main/java/org/apache/tika/eval/batch/FileComparerBuilder.java
new file mode 100644
index 0000000..cface16
--- /dev/null
+++ 
b/tika-eval/src/main/java/org/apache/tika/eval/batch/FileComparerBuilder.java
@@ -0,0 +1,122 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.tika.eval.batch;
+
+
+import java.io.IOException;
+import java.nio.file.Path;
+import java.sql.SQLException;
+import java.util.ArrayList;
+import java.util.List;
+
+import org.apache.tika.batch.FileResourceConsumer;
+import org.apache.tika.config.TikaConfig;
+import org.apache.tika.eval.AbstractProfiler;
+import org.apache.tika.eval.ExtractComparer;
+import org.apache.tika.eval.db.TableInfo;
+import org.apache.tika.eval.io.DBWriter;
+import org.apache.tika.eval.io.ExtractReader;
+import org.apache.tika.eval.io.IDBWriter;
+import org.apache.tika.util.PropsUtil;
+
+public class FileComparerBuilder extends EvalConsumerBuilder {
+    private final static String WHICH_DB = "h2";//TODO: allow flexibility
+
+
+    @Override
+    public FileResourceConsumer build() throws IOException, SQLException {
+        Path thisRootDir = PropsUtil.getPath(localAttrs.get("extractsA"), 
null);
+        if (thisRootDir == null) {
+            throw new RuntimeException("Must specify \"extractsA\" -- 
directory for 'A' extracts");
+        }
+        Path thatRootDir = PropsUtil.getPath(localAttrs.get("extractsB"), 
null);
+        if (thatRootDir == null) {
+            throw new RuntimeException("Must specify \"extractsB\" -- 
directory for 'B' extracts");
+        }
+
+        Path inputRootDir = PropsUtil.getPath(localAttrs.get("inputDir"), 
null);
+
+        long minJsonLength = 
PropsUtil.getLong(localAttrs.get("minJsonFileSizeBytes"), -1L);
+        long maxJsonLength = 
PropsUtil.getLong(localAttrs.get("maxJsonFileSizeBytes"), -1L);
+
+        ExtractReader.ALTER_METADATA_LIST alterExtractList = 
getAlterMetadata(localAttrs);
+
+
+        IDBWriter writer = getDBWriter();
+        //TODO: clean up the writing of the ref tables!!!
+        try {
+            populateRefTables(writer);
+        } catch (SQLException e) {
+            throw new RuntimeException("Can't populate ref tables", e);
+        }
+
+        if (inputRootDir == null) {
+            //this is for the sake of the crawler
+            throw new RuntimeException("Must specify an -inputDir");
+        }
+
+        return new ExtractComparer(queue, inputRootDir, thisRootDir, 
thatRootDir, writer,
+                minJsonLength, maxJsonLength, alterExtractList);
+    }
+
+    @Override
+    protected List<TableInfo> getTableInfo() {
+        List<TableInfo> tableInfos = new ArrayList<>();
+        tableInfos.add(ExtractComparer.COMPARISON_CONTAINERS);
+        tableInfos.add(ExtractComparer.PROFILES_A);
+        tableInfos.add(ExtractComparer.PROFILES_B);
+        tableInfos.add(ExtractComparer.ERROR_TABLE_A);
+        tableInfos.add(ExtractComparer.ERROR_TABLE_B);
+        tableInfos.add(ExtractComparer.EXCEPTION_TABLE_A);
+        tableInfos.add(ExtractComparer.EXCEPTION_TABLE_B);
+        tableInfos.add(ExtractComparer.ERROR_TABLE_A);
+        tableInfos.add(ExtractComparer.ERROR_TABLE_B);
+        tableInfos.add(ExtractComparer.CONTENTS_TABLE_A);
+        tableInfos.add(ExtractComparer.CONTENTS_TABLE_B);
+        tableInfos.add(ExtractComparer.EMBEDDED_FILE_PATH_TABLE_A);
+        tableInfos.add(ExtractComparer.EMBEDDED_FILE_PATH_TABLE_B);
+
+        tableInfos.add(ExtractComparer.CONTENT_COMPARISONS);
+        tableInfos.add(AbstractProfiler.MIME_TABLE);
+        tableInfos.add(ExtractComparer.REF_PAIR_NAMES);
+        tableInfos.add(AbstractProfiler.REF_PARSE_ERROR_TYPES);
+        tableInfos.add(AbstractProfiler.REF_PARSE_EXCEPTION_TYPES);
+        tableInfos.add(AbstractProfiler.REF_EXTRACT_ERROR_TYPES);
+        return tableInfos;
+    }
+
+    @Override
+    protected IDBWriter getDBWriter() throws IOException, SQLException {
+        return new DBWriter(getTableInfo(), TikaConfig.getDefaultConfig(), 
dbUtil);
+    }
+
+    @Override
+    protected void addErrorLogTablePairs(DBConsumersManager manager) {
+        Path errorLogA = PropsUtil.getPath(localAttrs.get("errorLogFileA"), 
null);
+        if (errorLogA == null) {
+            return;
+        }
+        manager.addErrorLogTablePair(errorLogA, ExtractComparer.ERROR_TABLE_A);
+        Path errorLogB = PropsUtil.getPath(localAttrs.get("errorLogFileB"), 
null);
+        if (errorLogB == null) {
+            return;
+        }
+        manager.addErrorLogTablePair(errorLogB, ExtractComparer.ERROR_TABLE_B);
+
+    }
+
+}

http://git-wip-us.apache.org/repos/asf/tika/blob/aa7a0c35/tika-eval/src/main/java/org/apache/tika/eval/batch/SingleFileConsumerBuilder.java
----------------------------------------------------------------------
diff --git 
a/tika-eval/src/main/java/org/apache/tika/eval/batch/SingleFileConsumerBuilder.java
 
b/tika-eval/src/main/java/org/apache/tika/eval/batch/SingleFileConsumerBuilder.java
new file mode 100644
index 0000000..de8be64
--- /dev/null
+++ 
b/tika-eval/src/main/java/org/apache/tika/eval/batch/SingleFileConsumerBuilder.java
@@ -0,0 +1,108 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.tika.eval.batch;
+
+import java.io.IOException;
+import java.nio.file.Files;
+import java.nio.file.Path;
+import java.sql.SQLException;
+import java.util.ArrayList;
+import java.util.List;
+
+import org.apache.tika.batch.FileResourceConsumer;
+import org.apache.tika.config.TikaConfig;
+import org.apache.tika.eval.AbstractProfiler;
+import org.apache.tika.eval.ExtractProfiler;
+import org.apache.tika.eval.db.TableInfo;
+import org.apache.tika.eval.io.DBWriter;
+import org.apache.tika.eval.io.ExtractReader;
+import org.apache.tika.eval.io.IDBWriter;
+import org.apache.tika.util.PropsUtil;
+
+
+public class SingleFileConsumerBuilder extends EvalConsumerBuilder {
+
+    @Override
+    public FileResourceConsumer build() throws IOException {
+        Path extractDir = PropsUtil.getPath(localAttrs.get("extractDir"), 
null);
+        if (extractDir == null) {
+            throw new RuntimeException("Must specify \"extractDir\" -- 
directory to crawl");
+        }
+        if (!Files.isDirectory(extractDir)) {
+            throw new RuntimeException("ROOT DIRECTORY DOES NOT EXIST: " +
+                    extractDir.toAbsolutePath());
+        }
+
+        Path inputDir = PropsUtil.getPath(localAttrs.get("inputDir"), null);
+
+        ExtractReader.ALTER_METADATA_LIST alterExtractList = 
getAlterMetadata(localAttrs);
+
+        IDBWriter writer = null;
+        try {
+            writer = getDBWriter();
+        } catch (SQLException ex) {
+            throw new IOException(ex);
+        }
+
+        //TODO: clean up the writing of the ref tables!!!
+        try {
+            populateRefTables(writer);
+        } catch (SQLException e) {
+            throw new RuntimeException("Can't populate ref tables", e);
+        }
+        //we _could_ set this to extractDir (if not null)
+        //here, but the Crawler defaults to "input" if nothing is passed
+        //so this won't work
+        if (inputDir == null) {
+            throw new RuntimeException("Must specify -inputDir");
+        }
+        if (extractDir == null && inputDir != null) {
+            extractDir = inputDir;
+        }
+        return new ExtractProfiler(queue, inputDir, extractDir, writer, 
alterExtractList);
+    }
+
+    @Override
+    protected List<TableInfo> getTableInfo() {
+        List<TableInfo> tableInfos = new ArrayList<TableInfo>();
+        tableInfos.add(AbstractProfiler.MIME_TABLE);
+        tableInfos.add(AbstractProfiler.REF_PARSE_ERROR_TYPES);
+        tableInfos.add(AbstractProfiler.REF_PARSE_EXCEPTION_TYPES);
+        tableInfos.add(AbstractProfiler.REF_EXTRACT_ERROR_TYPES);
+        tableInfos.add(ExtractProfiler.CONTAINER_TABLE);
+        tableInfos.add(ExtractProfiler.PROFILE_TABLE);
+        tableInfos.add(ExtractProfiler.ERROR_TABLE);
+        tableInfos.add(ExtractProfiler.EXCEPTION_TABLE);
+        tableInfos.add(ExtractProfiler.CONTENTS_TABLE);
+        tableInfos.add(ExtractProfiler.EMBEDDED_FILE_PATH_TABLE);
+        return tableInfos;
+    }
+
+    @Override
+    protected IDBWriter getDBWriter() throws IOException, SQLException {
+        return new DBWriter(getTableInfo(), TikaConfig.getDefaultConfig(), 
dbUtil);
+    }
+
+    @Override
+    protected void addErrorLogTablePairs(DBConsumersManager manager) {
+        Path errorLog = PropsUtil.getPath(localAttrs.get("errorLogFile"), 
null);
+        if (errorLog == null) {
+            return;
+        }
+        manager.addErrorLogTablePair(errorLog, ExtractProfiler.ERROR_TABLE);
+    }
+}

http://git-wip-us.apache.org/repos/asf/tika/blob/aa7a0c35/tika-eval/src/main/java/org/apache/tika/eval/db/AbstractDBBuffer.java
----------------------------------------------------------------------
diff --git 
a/tika-eval/src/main/java/org/apache/tika/eval/db/AbstractDBBuffer.java 
b/tika-eval/src/main/java/org/apache/tika/eval/db/AbstractDBBuffer.java
new file mode 100644
index 0000000..baa7994
--- /dev/null
+++ b/tika-eval/src/main/java/org/apache/tika/eval/db/AbstractDBBuffer.java
@@ -0,0 +1,77 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.tika.eval.db;
+
+import java.sql.SQLException;
+import java.util.HashMap;
+import java.util.Map;
+import java.util.concurrent.locks.Lock;
+import java.util.concurrent.locks.ReentrantReadWriteLock;
+
+
/**
 * Abstract buffer for map of values and unique ids.
 * <p>
 * Use this for fast in memory lookups of smallish sets of values.
 * Thread-safe: the map is guarded by a read/write lock.
 */
abstract class AbstractDBBuffer {

    //value -> 1-based id; guarded by rwl
    private final Map<String, Integer> m = new HashMap<>();
    private final ReentrantReadWriteLock rwl = new ReentrantReadWriteLock();
    private final Lock r = rwl.readLock();
    private final Lock w = rwl.writeLock();

    //count of new ids persisted via write(); guarded by w
    private int numWrites = 0;

    /**
     * Returns the id for {@code key}, assigning (and persisting via
     * {@link #write(int, String)}) a new 1-based id on first sight.
     */
    public int getId(String key) {
        r.lock();
        try {
            Integer v = m.get(key);
            if (v != null) {
                return v;
            }
        } finally {
            r.unlock();
        }

        //acquire the lock BEFORE entering try: if lock() ever failed, the
        //finally block must not unlock a lock we never held
        w.lock();
        try {
            //re-check: another thread may have inserted between the locks
            Integer v = m.get(key);
            if (v != null) {
                return v;
            }
            v = m.size() + 1;
            m.put(key, v);
            write(v, key);
            numWrites++;
            return v;
        } finally {
            w.unlock();
        }
    }

    public int getNumWrites() {
        return numWrites;
    }

    //Odd to throw RuntimeException, I know.  It should be
    //catastrophic if this buffer can't write to the db.
    public abstract void write(int id, String value) throws RuntimeException;

    public abstract void close() throws SQLException;
}

http://git-wip-us.apache.org/repos/asf/tika/blob/aa7a0c35/tika-eval/src/main/java/org/apache/tika/eval/db/ColInfo.java
----------------------------------------------------------------------
diff --git a/tika-eval/src/main/java/org/apache/tika/eval/db/ColInfo.java 
b/tika-eval/src/main/java/org/apache/tika/eval/db/ColInfo.java
new file mode 100644
index 0000000..a32f874
--- /dev/null
+++ b/tika-eval/src/main/java/org/apache/tika/eval/db/ColInfo.java
@@ -0,0 +1,116 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.tika.eval.db;
+
+
+import java.sql.Types;
+
+public class ColInfo {
+    private final Cols name;
+    private final int type;
+    private final Integer precision;
+    private final String constraints;
+
+    public ColInfo(Cols name, int type) {
+        this(name, type, null, null);
+    }
+
+    public ColInfo(Cols name, int type, String constraints) {
+        this(name, type, null, constraints);
+    }
+
+    public ColInfo(Cols name, int type, Integer precision) {
+        this(name, type, precision, null);
+    }
+
+
+    public ColInfo(Cols name, int type, Integer precision, String constraints) 
{
+        this.name = name;
+        this.type = type;
+        this.precision = precision;
+        this.constraints = constraints;
+    }
+
+    public int getType() {
+        return type;
+    }
+
+    public Cols getName() {
+        return name;
+    }
+    /**
+     *
+     * @return constraints string or null
+     */
+    public String getConstraints() {
+        return constraints;
+    }
+
+    /**
+     * Gets the precision.  This can be null!
+     * @return precision or null
+     */
+    public Integer getPrecision() {
+        return precision;
+    }
+
+    public String getSqlDef() {
+        if (type == Types.VARCHAR){
+            return "VARCHAR("+precision+")";
+        } else if (type == Types.CHAR) {
+            return "CHAR("+precision+")";
+        }
+        switch (type) {
+            case Types.FLOAT :
+                return "FLOAT";
+            case Types.DOUBLE :
+                return "DOUBLE";
+            case Types.BLOB :
+                return "BLOB";
+            case Types.INTEGER :
+                return "INTEGER";
+            case Types.BIGINT :
+                return "BIGINT";
+            case Types.BOOLEAN :
+                return "BOOLEAN";
+        }
+        throw new UnsupportedOperationException("Don't yet recognize a type 
for: "+type);
+    }
+
+    @Override
+    public boolean equals(Object o) {
+        if (this == o) return true;
+        if (o == null || getClass() != o.getClass()) return false;
+
+        ColInfo colInfo = (ColInfo) o;
+
+        if (type != colInfo.type) return false;
+        if (name != colInfo.name) return false;
+        if (precision != null ? !precision.equals(colInfo.precision) : 
colInfo.precision != null) return false;
+        return !(constraints != null ? 
!constraints.equals(colInfo.constraints) : colInfo.constraints != null);
+
+    }
+
+    @Override
+    public int hashCode() {
+        int result = name != null ? name.hashCode() : 0;
+        result = 31 * result + type;
+        result = 31 * result + (precision != null ? precision.hashCode() : 0);
+        result = 31 * result + (constraints != null ? constraints.hashCode() : 
0);
+        return result;
+    }
+}

http://git-wip-us.apache.org/repos/asf/tika/blob/aa7a0c35/tika-eval/src/main/java/org/apache/tika/eval/db/Cols.java
----------------------------------------------------------------------
diff --git a/tika-eval/src/main/java/org/apache/tika/eval/db/Cols.java 
b/tika-eval/src/main/java/org/apache/tika/eval/db/Cols.java
new file mode 100644
index 0000000..0dff9f7
--- /dev/null
+++ b/tika-eval/src/main/java/org/apache/tika/eval/db/Cols.java
@@ -0,0 +1,90 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.tika.eval.db;
+
/**
 * Column names used across the tika-eval database tables.
 * Each constant doubles as the literal column identifier.
 */
public enum Cols {
    //container table
    CONTAINER_ID,
    FILE_PATH,
    EXTRACT_FILE_LENGTH,

    EXTRACT_FILE_LENGTH_A, //for comparisons
    EXTRACT_FILE_LENGTH_B,

    //profile table
    ID,
    LENGTH,
    FILE_NAME,
    FILE_EXTENSION,
    ELAPSED_TIME_MILLIS,
    NUM_METADATA_VALUES,
    IS_EMBEDDED,
    EMBEDDED_FILE_PATH,
    MIME_TYPE_ID,
    MD5,
    NUM_ATTACHMENTS,
    HAS_CONTENT,

    //content statistics columns
    CONTENT_LENGTH,
    NUM_UNIQUE_TOKENS,
    NUM_TOKENS,
    NUM_ALPHABETIC_TOKENS, //alphabetic or ideographic tokens
    COMMON_TOKENS_LANG, //which language was used for the common tokens metric?
    NUM_COMMON_TOKENS,
    TOP_N_TOKENS,
    LANG_ID_1,
    LANG_ID_PROB_1,
    LANG_ID_2,
    LANG_ID_PROB_2,
    TOKEN_ENTROPY_RATE,
    TOKEN_LENGTH_SUM,
    TOKEN_LENGTH_MEAN,
    TOKEN_LENGTH_STD_DEV,
    UNICODE_CHAR_BLOCKS,
    NUM_PAGES, //number of pages a document alleges it has

    //content comparison columns (A vs B runs)
    TOP_10_UNIQUE_TOKEN_DIFFS_A,
    TOP_10_UNIQUE_TOKEN_DIFFS_B,
    TOP_10_MORE_IN_A,
    TOP_10_MORE_IN_B,
    OVERLAP,
    DICE_COEFFICIENT,

    //error columns
    PARSE_ERROR_TYPE_ID,

    PARSE_ERROR_DESCRIPTION,
    PARSE_EXCEPTION_DESCRIPTION,

    EXTRACT_ERROR_TYPE_ID,
    EXTRACT_ERROR_DESCRIPTION,


    //exception columns
    ORIG_STACK_TRACE,
    SORT_STACK_TRACE,
    PARSE_EXCEPTION_TYPE_ID,


    MIME_STRING,//string representation of mime type

    DIR_NAME_A,//for comparisons in REF_PAIR_NAMES
    DIR_NAME_B
    }
+

http://git-wip-us.apache.org/repos/asf/tika/blob/aa7a0c35/tika-eval/src/main/java/org/apache/tika/eval/db/DBBuffer.java
----------------------------------------------------------------------
diff --git a/tika-eval/src/main/java/org/apache/tika/eval/db/DBBuffer.java 
b/tika-eval/src/main/java/org/apache/tika/eval/db/DBBuffer.java
new file mode 100644
index 0000000..8d936c2
--- /dev/null
+++ b/tika-eval/src/main/java/org/apache/tika/eval/db/DBBuffer.java
@@ -0,0 +1,54 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.tika.eval.db;
+
+
+import java.sql.Connection;
+import java.sql.PreparedStatement;
+import java.sql.SQLException;
+
+public class DBBuffer extends AbstractDBBuffer {
+
+    private final PreparedStatement st;
+
+    public DBBuffer(Connection connection, String tableName,
+                    String idColumnName, String valueColumnName) throws 
SQLException {
+        st = connection.prepareStatement("insert into "+tableName+ "( "+
+                idColumnName + ", " + valueColumnName+") values (?,?);");
+    }
+
+    @Override
+    public void write(int id, String value) throws RuntimeException {
+        try {
+            st.clearParameters();
+            st.setInt(1, id);
+            st.setString(2, value);
+            st.execute();
+
+        } catch (SQLException e) {
+            throw new RuntimeException(e);
+        }
+    }
+
+    @Override
+    public void close() throws SQLException {
+        st.close();
+
+    }
+
+
+}

http://git-wip-us.apache.org/repos/asf/tika/blob/aa7a0c35/tika-eval/src/main/java/org/apache/tika/eval/db/DBUtil.java
----------------------------------------------------------------------
diff --git a/tika-eval/src/main/java/org/apache/tika/eval/db/DBUtil.java 
b/tika-eval/src/main/java/org/apache/tika/eval/db/DBUtil.java
new file mode 100644
index 0000000..1efa48a
--- /dev/null
+++ b/tika-eval/src/main/java/org/apache/tika/eval/db/DBUtil.java
@@ -0,0 +1,201 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.tika.eval.db;
+
+
+import java.io.IOException;
+import java.nio.file.Path;
+import java.sql.Connection;
+import java.sql.DriverManager;
+import java.sql.PreparedStatement;
+import java.sql.SQLException;
+import java.sql.Statement;
+import java.sql.Types;
+import java.util.List;
+import java.util.Locale;
+import java.util.Map;
+import java.util.Set;
+
+import org.apache.log4j.Logger;
+import org.apache.tika.io.IOExceptionWithCause;
+
+public abstract class DBUtil {
+
    //shared log4j logger for all DBUtil implementations
    public static Logger logger = Logger.getLogger(DBUtil.class);
    //fully-qualified JDBC driver class name, e.g. "org.h2.Driver"
    public abstract String getJDBCDriverClass();
    //NOTE(review): presumably returns whether a table was actually dropped --
    //confirm against the concrete implementations
    public abstract boolean dropTableIfExists(Connection conn, String tableName) throws SQLException;
    //path to the file/directory backing the db
    private final Path db;
    public DBUtil(Path db) {
        this.db = db;
    }
+
+    /**
+     * This is intended for a file/directory based db.
+     * <p>
+     * Override this any optimizations you want to do on the db
+     * before writing/reading.
+     *
+     * @return
+     * @throws IOException
+     */
+    public Connection getConnection(boolean createIfDoesntExist) throws 
IOException {
+        String connectionString = getConnectionString(db, createIfDoesntExist);
+        Connection conn = null;
+        try {
+            try {
+                Class.forName(getJDBCDriverClass());
+            } catch (ClassNotFoundException e) {
+                throw new RuntimeException(e);
+            }
+            conn = DriverManager.getConnection(connectionString);
+            conn.setAutoCommit(false);
+        } catch (SQLException e) {
+            throw new IOExceptionWithCause(e);
+        }
+        return conn;
+    }
+
+    abstract public String getConnectionString(Path db, boolean 
createIfDoesntExist);
+
+    /**
+     *
+     * @param connection
+     * @return a list of uppercased table names
+     * @throws SQLException
+     */
+    abstract public Set<String> getTables(Connection connection) throws 
SQLException;
+
+    public static int insert(PreparedStatement insertStatement,
+                              TableInfo table,
+                              Map<Cols, String> data) throws SQLException {
+
+        //clear parameters before setting
+        insertStatement.clearParameters();
+        try {
+            int i = 1;
+            for (ColInfo colInfo : table.getColInfos()) {
+                updateInsertStatement(i, insertStatement, colInfo, 
data.get(colInfo.getName()));
+                i++;
+            }
+            for (Cols c : data.keySet()) {
+                if (! table.containsColumn(c)) {
+                    throw new IllegalArgumentException("Can't add data to "+c +
+                    " because it doesn't exist in the table: 
"+table.getName());
+                }
+            }
+            return insertStatement.executeUpdate();
+        } catch (SQLException e) {
+            logger.warn("couldn't insert data for this row: "+e.getMessage());
+            return -1;
+        }
+    }
+
+    public static void updateInsertStatement(int dbColOffset, 
PreparedStatement st,
+                                             ColInfo colInfo, String value ) 
throws SQLException {
+        if (value == null) {
+            st.setNull(dbColOffset, colInfo.getType());
+            return;
+        }
+        try {
+            switch (colInfo.getType()) {
+                case Types.VARCHAR:
+                    if (value != null && value.length() > 
colInfo.getPrecision()) {
+                        value = value.substring(0, colInfo.getPrecision());
+                        logger.warn("truncated varchar value in " + 
colInfo.getName() + " : "+value);
+                    }
+                    st.setString(dbColOffset, value);
+                    break;
+                case Types.CHAR:
+                    st.setString(dbColOffset, value);
+                    break;
+                case Types.DOUBLE:
+                    st.setDouble(dbColOffset, Double.parseDouble(value));
+                    break;
+                case Types.FLOAT:
+                    st.setDouble(dbColOffset, Float.parseFloat(value));
+                    break;
+                case Types.INTEGER:
+                    st.setDouble(dbColOffset, Integer.parseInt(value));
+                    break;
+                case Types.BIGINT:
+                    st.setLong(dbColOffset, Long.parseLong(value));
+                    break;
+                case Types.BOOLEAN:
+                    st.setBoolean(dbColOffset, Boolean.parseBoolean(value));
+                    break;
+                default:
+                    throw new UnsupportedOperationException("Don't yet support 
type: " + colInfo.getType());
+            }
+        } catch (NumberFormatException e) {
+            if (! "".equals(value)) {
+                logger.warn("number format exception: " + colInfo.getName() + 
" : " + value);
+            }
+            st.setNull(dbColOffset, colInfo.getType());
+        } catch (SQLException e) {
+            logger.warn("sqlexception: "+colInfo+ " : " + value);
+            st.setNull(dbColOffset, colInfo.getType());
+        }
+    }
+
+    public void createDB(List<TableInfo> tableInfos, boolean append) throws 
SQLException, IOException {
+        Connection conn = getConnection(true);
+        Set<String> tables = getTables(conn);
+
+        for (TableInfo tableInfo : tableInfos) {
+
+            if (append && 
tables.contains(tableInfo.getName().toUpperCase(Locale.ROOT))) {
+                continue;
+            }
+            if (! append) {
+                dropTableIfExists(conn, tableInfo.getName());
+            }
+            createTable(conn, tableInfo);
+        }
+
+        conn.commit();
+        conn.close();
+    }
+
+    private void createTable(Connection conn, TableInfo tableInfo) throws 
SQLException {
+        StringBuilder createSql = new StringBuilder();
+        createSql.append("CREATE TABLE "+tableInfo.getName());
+        createSql.append("(");
+
+        int last = 0;
+        for (ColInfo col : tableInfo.getColInfos()) {
+            last++;
+            if (last > 1) {
+                createSql.append(", ");
+            }
+            createSql.append(col.getName());
+            createSql.append(" ");
+            createSql.append(col.getSqlDef());
+            String constraints = col.getConstraints();
+            if (constraints != null) {
+                createSql.append(" ");
+                createSql.append(constraints);
+            }
+        }
+        createSql.append(")");
+        Statement st = conn.createStatement();
+        st.execute(createSql.toString());
+
+        st.close();
+        conn.commit();
+    }
+}

http://git-wip-us.apache.org/repos/asf/tika/blob/aa7a0c35/tika-eval/src/main/java/org/apache/tika/eval/db/H2Util.java
----------------------------------------------------------------------
diff --git a/tika-eval/src/main/java/org/apache/tika/eval/db/H2Util.java 
b/tika-eval/src/main/java/org/apache/tika/eval/db/H2Util.java
new file mode 100644
index 0000000..0b3ebac
--- /dev/null
+++ b/tika-eval/src/main/java/org/apache/tika/eval/db/H2Util.java
@@ -0,0 +1,71 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.tika.eval.db;
+
+import java.nio.file.Path;
+import java.sql.Connection;
+import java.sql.ResultSet;
+import java.sql.SQLException;
+import java.sql.Statement;
+import java.util.HashSet;
+import java.util.Set;
+
+import org.apache.commons.io.FilenameUtils;
+
+
+public class H2Util extends DBUtil {
+
+    public H2Util(Path db) {
+        super(db);
+    }
+
+    @Override
+    public String getJDBCDriverClass() {
+        return "org.h2.Driver";
+    }
+
+    @Override
+    public boolean dropTableIfExists(Connection conn, String tableName) throws 
SQLException {
+        Statement st = conn.createStatement();
+        String sql = "drop table if exists "+tableName;
+        boolean success = st.execute(sql);
+        st.close();
+        return success;
+    }
+
+    @Override
+    public String getConnectionString(Path db, boolean createIfDoesntExist) {
+        String s = "jdbc:h2:"+ 
FilenameUtils.separatorsToUnix(db.toAbsolutePath().toString());
+        if (! createIfDoesntExist) {
+            s += ";IFEXISTS=TRUE";
+        }
+        return s;
+    }
+
+    @Override
+    public Set<String> getTables(Connection connection) throws SQLException {
+        String sql = "SHOW TABLES";
+        Statement st = connection.createStatement();
+        ResultSet rs = st.executeQuery(sql);
+        Set<String> tables = new HashSet<>();
+        while (rs.next()) {
+            String table = rs.getString(1);
+            tables.add(table);
+        }
+        return tables;
+    }
+}

http://git-wip-us.apache.org/repos/asf/tika/blob/aa7a0c35/tika-eval/src/main/java/org/apache/tika/eval/db/MimeBuffer.java
----------------------------------------------------------------------
diff --git a/tika-eval/src/main/java/org/apache/tika/eval/db/MimeBuffer.java 
b/tika-eval/src/main/java/org/apache/tika/eval/db/MimeBuffer.java
new file mode 100644
index 0000000..11a72f8
--- /dev/null
+++ b/tika-eval/src/main/java/org/apache/tika/eval/db/MimeBuffer.java
@@ -0,0 +1,144 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.tika.eval.db;
+
+import java.sql.Connection;
+import java.sql.PreparedStatement;
+import java.sql.SQLException;
+import java.sql.Types;
+
+import org.apache.tika.config.TikaConfig;
+import org.apache.tika.eval.AbstractProfiler;
+import org.apache.tika.mime.MediaType;
+import org.apache.tika.mime.MimeType;
+import org.apache.tika.mime.MimeTypeException;
+import org.apache.tika.mime.MimeTypes;
+
+
+public class MimeBuffer extends AbstractDBBuffer {
+
+    private final PreparedStatement st;
+    private final TikaConfig config;
+
+    public MimeBuffer(Connection connection, TikaConfig config) throws 
SQLException {
+        st = connection.prepareStatement("insert into " + 
AbstractProfiler.MIME_TABLE.getName() + "( " +
+                Cols.MIME_TYPE_ID.name() + ", " +
+                Cols.MIME_STRING + ", " +
+                Cols.FILE_EXTENSION + ") values (?,?,?);");
+        this.config = config;
+    }
+
+    @Override
+    public void write(int id, String value) throws RuntimeException {
+        try {
+            st.clearParameters();
+            st.setInt(1, id);
+            st.setString(2, value);
+            try {
+                String ext = MimeUtil.getExtension(value, config);
+                if (ext == null || ext.length() == 0) {
+                    st.setNull(3, Types.VARCHAR);
+                } else {
+                    st.setString(3, ext);
+                }
+            } catch (MimeTypeException e) {
+                st.setNull(3, Types.VARCHAR);
+            }
+            st.execute();
+
+        } catch (SQLException e) {
+            throw new RuntimeException(e);
+        }
+    }
+
+    @Override
+    public void close() throws SQLException {
+        st.close();
+    }
+
+    private static class MimeUtil {
+        //TODO: see if MimeType now works for these
+        private static final String APPLICATION = "application";
+        private static final String TEXT = "text";
+        private static final String HTML = "html";
+        private static final String XML = "xml";
+        private static final String XHTML_XML = "xhtml+xml";
+        private static final String CSS = "css";
+        private static final String CSV = "csv";
+        private static final String PLAIN = "plain";
+        private static final String EMPTY_STRING = "";
+
+        /**
+         * Utility method to convert from a string value representing a 
content type
+         * (e.g. "application/pdf") into the most common extension for that 
file type
+         * (e.g. "pdf").
+         * <p>
+         * This will has special handling for texty filetypes whose MimeTypes
+         * don't currently return anything for {@link 
org.apache.tika.mime.MimeType#getExtension};
+         *
+         * @param contentType string representing a content type, for example: 
"application/pdf"
+         * @param config      config from which to get MimeRepository
+         * @return extension or empty string
+         * @throws org.apache.tika.mime.MimeTypeException thrown if MimeTypes 
can't parse the contentType
+         */
+        public static String getExtension(String contentType, TikaConfig 
config)
+                throws MimeTypeException {
+            MimeTypes types = config.getMimeRepository();
+            MimeType mime = types.forName(contentType);
+            return getExtension(mime);
+        }
+
+        public static String getExtension(MimeType mime) {
+
+            String ext = mime.getExtension();
+            if (ext.startsWith(".")) {
+                ext = ext.substring(1);
+            }
+
+            //special handling for text/html/xml
+            if (ext.length() == 0) {
+                ext = tryTextyTypes(mime.getType());
+            }
+            return ext;
+        }
+
+        private static String tryTextyTypes(MediaType mediaType) {
+
+            String type = mediaType.getType();
+            String subtype = mediaType.getSubtype();
+            if (type.equals(TEXT)) {
+                if (subtype.equals(HTML)) {
+                    return HTML;
+                } else if (subtype.equals(PLAIN)) {
+                    return "txt";
+                } else if (subtype.equals(CSS)) {
+                    return CSS;
+                } else if (subtype.equals(CSV)) {
+                    return CSV;
+                }
+            } else if (type.equals(APPLICATION)) {
+                if (subtype.equals(XML)) {
+                    return XML;
+                } else if (subtype.equals(XHTML_XML)) {
+                    return "html";
+                }
+            }
+            return EMPTY_STRING;
+        }
+    }
+
+}

http://git-wip-us.apache.org/repos/asf/tika/blob/aa7a0c35/tika-eval/src/main/java/org/apache/tika/eval/db/TableInfo.java
----------------------------------------------------------------------
diff --git a/tika-eval/src/main/java/org/apache/tika/eval/db/TableInfo.java 
b/tika-eval/src/main/java/org/apache/tika/eval/db/TableInfo.java
new file mode 100644
index 0000000..c6e3696
--- /dev/null
+++ b/tika-eval/src/main/java/org/apache/tika/eval/db/TableInfo.java
@@ -0,0 +1,64 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.tika.eval.db;
+
+
+import java.util.ArrayList;
+import java.util.Collections;
+import java.util.HashSet;
+import java.util.List;
+import java.util.Set;
+
+public class TableInfo {
+
+    private final String name;
+    private final List<ColInfo> colInfos = new ArrayList<>();
+    private final Set<Cols> colNames = new HashSet<>();
+
+    public TableInfo(String name, ColInfo... cols) {
+        Collections.addAll(colInfos, cols);
+        Collections.unmodifiableList(colInfos);
+        this.name = name;
+        for (ColInfo c : colInfos) {
+            assert (!colNames.contains(c.getName()));
+            colNames.add(c.getName());
+        }
+    }
+
+    public TableInfo(String name, List<ColInfo> cols) {
+        colInfos.addAll(cols);
+        Collections.unmodifiableList(colInfos);
+        this.name = name;
+        for (ColInfo c : colInfos) {
+            assert (!colNames.contains(c.getName()));
+            colNames.add(c.getName());
+        }
+    }
+
+    public String getName() {
+        return name;
+    }
+
+    public List<ColInfo> getColInfos() {
+        return colInfos;
+    }
+
+    public boolean containsColumn(Cols cols) {
+        return colNames.contains(cols);
+    }
+}
+

http://git-wip-us.apache.org/repos/asf/tika/blob/aa7a0c35/tika-eval/src/main/java/org/apache/tika/eval/io/DBWriter.java
----------------------------------------------------------------------
diff --git a/tika-eval/src/main/java/org/apache/tika/eval/io/DBWriter.java 
b/tika-eval/src/main/java/org/apache/tika/eval/io/DBWriter.java
new file mode 100644
index 0000000..db4cd04
--- /dev/null
+++ b/tika-eval/src/main/java/org/apache/tika/eval/io/DBWriter.java
@@ -0,0 +1,141 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.tika.eval.io;
+
+import java.io.IOException;
+import java.sql.Connection;
+import java.sql.PreparedStatement;
+import java.sql.SQLException;
+import java.util.HashMap;
+import java.util.List;
+import java.util.Map;
+import java.util.concurrent.atomic.AtomicLong;
+
+import org.apache.log4j.Logger;
+import org.apache.tika.config.TikaConfig;
+import org.apache.tika.eval.db.ColInfo;
+import org.apache.tika.eval.db.Cols;
+import org.apache.tika.eval.db.DBUtil;
+import org.apache.tika.eval.db.MimeBuffer;
+import org.apache.tika.eval.db.TableInfo;
+import org.apache.tika.io.IOExceptionWithCause;
+
+/**
+ * This is still in its early stages.  The idea is to
+ * get something working with h2 and then add to that
+ * as necessary.
+ *
+ * Beware, this deletes the db file with each initialization.
+ */
+public class DBWriter implements IDBWriter {
+    
+    private static Logger logger = Logger.getLogger(DBWriter.class);
+    private final AtomicLong insertedRows = new AtomicLong();
+    private final Long commitEveryX = 1000L;
+
+    private final List<TableInfo> tableInfos;
+    private final Connection conn;
+    private final DBUtil dbUtil;
+    private static MimeBuffer mimeBuffer;
+
+    //<tableName, preparedStatement>
+    private final Map<String, PreparedStatement> inserts = new HashMap<>();
+
+    public DBWriter(List<TableInfo> tableInfos, TikaConfig tikaConfig, DBUtil 
dbUtil)
+            throws IOException, SQLException {
+
+        this.conn = dbUtil.getConnection(true);
+        if (mimeBuffer == null) {
+            mimeBuffer = new MimeBuffer(conn, tikaConfig);
+        }
+        this.tableInfos = tableInfos;
+        this.dbUtil = dbUtil;
+        for (TableInfo tableInfo : tableInfos) {
+            try {
+                PreparedStatement st = createPreparedInsert(tableInfo);
+                inserts.put(tableInfo.getName(), st);
+            } catch (SQLException e) {
+                throw new RuntimeException(e);
+            }
+        }
+    }
+
+    public int getMimeId(String mimeString) {
+        return mimeBuffer.getId(mimeString);
+    }
+
+    private PreparedStatement createPreparedInsert(TableInfo tableInfo) throws 
SQLException {
+        StringBuilder sb = new StringBuilder();
+        sb.append("INSERT INTO ").append(tableInfo.getName());
+        sb.append("(");
+        int i = 0;
+        for (ColInfo c : tableInfo.getColInfos()) {
+            if (i++ > 0) {
+                sb.append(", ");
+            }
+            sb.append(c.getName());
+        }
+        sb.append(") ");
+
+        sb.append("VALUES");
+        sb.append("(");
+        for (int j = 0; j < i; j++) {
+            if (j > 0) {
+                sb.append(", ");
+            }
+            sb.append("?");
+        }
+        sb.append(")");
+
+        return conn.prepareStatement(sb.toString());
+    }
+
+
+    public void writeRow(TableInfo table, Map<Cols, String> data) throws 
IOException {
+        try {
+            PreparedStatement p = inserts.get(table.getName());
+            if (p == null) {
+                throw new RuntimeException("Failed to create prepared 
statement for: "+
+                        table.getName());
+            }
+            dbUtil.insert(p, table, data);
+            long rows = insertedRows.incrementAndGet();
+            if (rows % commitEveryX == 0) {
+                logger.info("writer is committing after "+ rows + " rows");
+                conn.commit();
+            }
+        } catch (SQLException e) {
+            throw new IOException(e);
+        }
+    }
+
+    public void close() throws IOException {
+        try {
+            mimeBuffer.close();
+            conn.commit();
+        } catch (SQLException e){
+            e.printStackTrace();
+            throw new IOExceptionWithCause(e);
+        }
+        try {
+            conn.close();
+        } catch (SQLException e) {
+            throw new IOExceptionWithCause(e);
+        }
+
+    }
+}

Reply via email to