This is an automated email from the ASF dual-hosted git repository.

tallison pushed a commit to branch branch_1x
in repository https://gitbox.apache.org/repos/asf/tika.git

commit 65c318358db36e64afa477d2b90713f77ec73c4c
Author: tallison <[email protected]>
AuthorDate: Fri Oct 30 10:48:09 2020 -0400

    TIKA-3216 -- Add FileProfiler
    
    # Conflicts:
    #   tika-eval/src/main/java/org/apache/tika/eval/TikaEvalCLI.java
---
 .../apache/tika/detect/FileCommandDetector.java    |   4 +-
 .../java/org/apache/tika/eval/ExtractComparer.java |   2 +-
 .../java/org/apache/tika/eval/ExtractProfiler.java |   2 +-
 .../java/org/apache/tika/eval/FileProfiler.java    | 158 +++++++++++++++++++++
 .../java/org/apache/tika/eval/TikaEvalCLI.java     |  61 +++++++-
 .../org/apache/tika/eval/XMLErrorLogUpdater.java   |   1 -
 .../tika/eval/batch/EvalConsumerBuilder.java       |   3 +-
 .../tika/eval/batch/FileProfilerBuilder.java       |  93 ++++++++++++
 .../main/java/org/apache/tika/eval/db/Cols.java    |   3 +
 .../java/org/apache/tika/eval/db/JDBCUtil.java     |  49 ++++---
 .../java/org/apache/tika/eval/db/MimeBuffer.java   |   1 -
 .../java/org/apache/tika/eval/io/DBWriter.java     |  11 +-
 .../resources/tika-eval-file-profiler-config.xml   |  74 ++++++++++
 13 files changed, 421 insertions(+), 41 deletions(-)

diff --git a/tika-core/src/main/java/org/apache/tika/detect/FileCommandDetector.java b/tika-core/src/main/java/org/apache/tika/detect/FileCommandDetector.java
index b68991d..fd851c7 100644
--- a/tika-core/src/main/java/org/apache/tika/detect/FileCommandDetector.java
+++ b/tika-core/src/main/java/org/apache/tika/detect/FileCommandDetector.java
@@ -66,12 +66,12 @@ public class FileCommandDetector implements Detector {
     private int maxBytes = 1_000_000;
     private long timeoutMs = DEFAULT_TIMEOUT_MS;
 
-    static boolean checkHasFile() {
+    public static boolean checkHasFile() {
         return checkHasFile(DEFAULT_FILE_COMMAND_PATH);
     }
 
 
-    static boolean checkHasFile(String fileCommandPath) {
+    public static boolean checkHasFile(String fileCommandPath) {
         String[] commandline = new String[]{
             fileCommandPath, "-v"
         };
diff --git a/tika-eval/src/main/java/org/apache/tika/eval/ExtractComparer.java b/tika-eval/src/main/java/org/apache/tika/eval/ExtractComparer.java
index 91568db..454a92e 100644
--- a/tika-eval/src/main/java/org/apache/tika/eval/ExtractComparer.java
+++ b/tika-eval/src/main/java/org/apache/tika/eval/ExtractComparer.java
@@ -82,7 +82,7 @@ public class ExtractComparer extends AbstractProfiler {
                 .addOption("jdbcDriver", true, "EXPERT: jdbc driver, or 
specify via -Djdbc.driver")
                 .addOption("tablePrefixA", true, "EXPERT: optional prefix for 
table names for A")
                 .addOption("tablePrefixB", true, "EXPERT: optional prefix for 
table names for B")
-                .addOption("drop", true, "drop tables if they exist")
+                .addOption("drop", false, "drop tables if they exist")
                 .addOption("maxFilesToAdd", true, "maximum number of files to 
add to the crawler")
                 .addOption("maxTokens", true, "maximum tokens to process, 
default=200000")
                 .addOption("maxContentLength", true, "truncate content beyond 
this length for calculating 'contents' stats, default=1000000")
diff --git a/tika-eval/src/main/java/org/apache/tika/eval/ExtractProfiler.java b/tika-eval/src/main/java/org/apache/tika/eval/ExtractProfiler.java
index e6e4c14..5fad576 100644
--- a/tika-eval/src/main/java/org/apache/tika/eval/ExtractProfiler.java
+++ b/tika-eval/src/main/java/org/apache/tika/eval/ExtractProfiler.java
@@ -66,7 +66,7 @@ public class ExtractProfiler extends AbstractProfiler {
                 .addOption("jdbc", true, "EXPERT: full jdbc connection string. 
Must specify this or -db <h2db>")
                 .addOption("jdbcDriver", true, "EXPERT: jdbc driver, or 
specify via -Djdbc.driver")
                 .addOption("tablePrefix", true, "EXPERT: optional prefix for 
table names")
-                .addOption("drop", true, "drop tables if they exist")
+                .addOption("drop", false, "drop tables if they exist")
                 .addOption("maxFilesToAdd", true, "maximum number of files to 
add to the crawler")
                 .addOption("maxTokens", true, "maximum tokens to process, 
default=200000")
                 .addOption("maxContentLength", true, "truncate content beyond 
this length for calculating 'contents' stats, default=1000000")
diff --git a/tika-eval/src/main/java/org/apache/tika/eval/FileProfiler.java b/tika-eval/src/main/java/org/apache/tika/eval/FileProfiler.java
new file mode 100644
index 0000000..dcd1751
--- /dev/null
+++ b/tika-eval/src/main/java/org/apache/tika/eval/FileProfiler.java
@@ -0,0 +1,158 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.tika.eval;
+
+import org.apache.commons.cli.HelpFormatter;
+import org.apache.commons.cli.Option;
+import org.apache.commons.cli.Options;
+import org.apache.commons.codec.digest.DigestUtils;
+import org.apache.tika.Tika;
+import org.apache.tika.batch.FileResource;
+import org.apache.tika.batch.fs.FSProperties;
+import org.apache.tika.detect.FileCommandDetector;
+import org.apache.tika.eval.db.ColInfo;
+import org.apache.tika.eval.db.Cols;
+import org.apache.tika.eval.db.TableInfo;
+import org.apache.tika.eval.io.IDBWriter;
+import org.apache.tika.io.TikaInputStream;
+import org.apache.tika.metadata.Metadata;
+
+import java.io.IOException;
+import java.io.InputStream;
+import java.nio.file.Files;
+import java.nio.file.Path;
+import java.sql.Types;
+import java.util.HashMap;
+import java.util.Map;
+import java.util.concurrent.ArrayBlockingQueue;
+
+/**
+ * This class profiles actual files, as opposed to extracts (see {@link ExtractProfiler}).
+ * This does _not_ parse files, but it does run file type identification and digests the
+ * raw bytes.
+ *
+ * If the 'file' command is available on the command line, this will also run the
+ * FileCommandDetector.
+ */
+
+public class FileProfiler extends AbstractProfiler {
+//TODO: we should allow users to select digest type/encoding and file 
detector(s).
+
+    private static final boolean HAS_FILE = FileCommandDetector.checkHasFile();
+    static Options OPTIONS;
+    static {
+
+        Option inputDir = new Option("inputDir", true,
+                "optional: directory for original binary input documents."+
+                        " If not specified, -extracts is crawled as is.");
+
+        OPTIONS = new Options()
+                .addOption(inputDir)
+                .addOption("bc", "optional: tika-batch config file")
+                .addOption("numConsumers", true, "optional: number of consumer 
threads")
+                .addOption("db", true, "db file to which to write results")
+                .addOption("jdbc", true, "EXPERT: full jdbc connection string. 
Must specify this or -db <h2db>")
+                .addOption("jdbcDriver", true, "EXPERT: jdbc driver, or 
specify via -Djdbc.driver")
+                .addOption("tablePrefix", true, "EXPERT: optional prefix for 
table names")
+                .addOption("drop", false, "drop tables if they exist")
+                .addOption("maxFilesToAdd", true, "maximum number of files to 
add to the crawler")
+
+        ;
+
+    }
+
+    public static void USAGE() {
+        HelpFormatter helpFormatter = new HelpFormatter();
+        helpFormatter.printHelp(
+                80,
+                "java -jar tika-eval-x.y.jar FileProfiler -inputDir docs -db 
mydb [-inputDir input]",
+                "Tool: Profile",
+                FileProfiler.OPTIONS,
+                "Note: for the default h2 db, do not include the .mv.db at the 
end of the db name.");
+    }
+
+
+
+    public static TableInfo FILE_PROFILES = HAS_FILE ?
+            new TableInfo("file_profiles",
+                new ColInfo(Cols.FILE_PATH, Types.VARCHAR, 2048, "PRIMARY 
KEY"),
+                new ColInfo(Cols.LENGTH, Types.BIGINT),
+                new ColInfo(Cols.SHA256, Types.VARCHAR, 64),
+                new ColInfo(Cols.TIKA_MIME_ID, Types.INTEGER),
+                new ColInfo(Cols.FILE_MIME_ID, Types.INTEGER))
+            :
+            new TableInfo("file_profiles",
+                    new ColInfo(Cols.FILE_PATH, Types.VARCHAR, 2048, "PRIMARY 
KEY"),
+                    new ColInfo(Cols.LENGTH, Types.BIGINT),
+                    new ColInfo(Cols.SHA256, Types.VARCHAR, 64),
+                    new ColInfo(Cols.TIKA_MIME_ID, Types.INTEGER));
+
+
+
+    public static final String DETECT_EXCEPTION = "detect-exception";
+    private static final Tika TIKA = new Tika();
+
+    private static final FileCommandDetector FILE_COMMAND_DETECTOR = new 
FileCommandDetector();
+    private final Path inputDir;
+
+    public FileProfiler(ArrayBlockingQueue<FileResource> fileQueue, Path 
inputDir, IDBWriter dbWriter) {
+        super(fileQueue, dbWriter);
+        this.inputDir = inputDir;
+    }
+
+
+    @Override
+    public boolean processFileResource(FileResource fileResource) {
+        String relPath = 
fileResource.getMetadata().get(FSProperties.FS_REL_PATH);
+        try (InputStream is = fileResource.openInputStream()) {
+            try (TikaInputStream tis = TikaInputStream.get(is)) {
+                Path path = tis.getPath();
+                Map<Cols, String> data = new HashMap<>();
+                int tikaMimeId = writer.getMimeId(detectTika(tis));
+                data.put(Cols.FILE_PATH, relPath);
+                data.put(Cols.TIKA_MIME_ID, Integer.toString(tikaMimeId));
+                data.put(Cols.LENGTH, Long.toString(Files.size(path)));
+                data.put(Cols.SHA256, DigestUtils.sha256Hex(tis));
+                if (HAS_FILE) {
+                    int fileMimeId = writer.getMimeId(detectFile(tis));
+                    data.put(Cols.FILE_MIME_ID, Integer.toString(fileMimeId));
+                }
+                writer.writeRow(FILE_PROFILES, data);
+            }
+        } catch (IOException e) {
+            //log at least!
+            return false;
+        }
+        return true;
+    }
+
+    private String detectFile(TikaInputStream tis) {
+        try {
+            return FILE_COMMAND_DETECTOR.detect(tis, new 
Metadata()).toString();
+        } catch (IOException e) {
+            return DETECT_EXCEPTION;
+        }
+    }
+
+    private String detectTika(TikaInputStream tis) {
+        try {
+            return TIKA.detect(tis);
+        } catch (IOException e) {
+            return DETECT_EXCEPTION;
+        }
+    }
+}
diff --git a/tika-eval/src/main/java/org/apache/tika/eval/TikaEvalCLI.java b/tika-eval/src/main/java/org/apache/tika/eval/TikaEvalCLI.java
index fc3f22b..14faa6b 100644
--- a/tika-eval/src/main/java/org/apache/tika/eval/TikaEvalCLI.java
+++ b/tika-eval/src/main/java/org/apache/tika/eval/TikaEvalCLI.java
@@ -56,11 +56,60 @@ public class TikaEvalCLI {
             handleProfile(subsetArgs);
         } else if (tool.equals("StartDB")) {
             handleStartDB(subsetArgs);
+        } else if (tool.equals("FileProfile")){
+            handleProfileFiles(subsetArgs);
         } else {
             System.out.println(specifyTools());
         }
     }
 
+    private void handleProfileFiles(String[] subsetArgs) throws Exception {
+        List<String> argList = new ArrayList(Arrays.asList(subsetArgs));
+
+        boolean containsBC = false;
+        String inputDir = null;
+        //confirm there's a batch-config file
+        for (int i = 0; i < argList.size(); i++) {
+            String arg = argList.get(i);
+            if (arg.equals("-bc")) {
+                containsBC = true;
+            }
+        }
+
+        Path tmpBCConfig = null;
+        try {
+            tmpBCConfig = Files.createTempFile("tika-eval-profiler", ".xml");
+            if (! containsBC) {
+                try (InputStream is = 
this.getClass().getResourceAsStream("/tika-eval-file-profiler-config.xml")) {
+                    Files.copy(is, tmpBCConfig, 
StandardCopyOption.REPLACE_EXISTING);
+                }
+                argList.add("-bc");
+                argList.add(tmpBCConfig.toAbsolutePath().toString());
+            }
+
+            String[] updatedArgs = argList.toArray(new String[argList.size()]);
+            DefaultParser defaultCLIParser = new DefaultParser();
+            try {
+                CommandLine commandLine = 
defaultCLIParser.parse(FileProfiler.OPTIONS, updatedArgs);
+                if (commandLine.hasOption("db") && 
commandLine.hasOption("jdbc")) {
+                    System.out.println("Please specify either the default -db 
or the full -jdbc, not both");
+                    ExtractProfiler.USAGE();
+                    return;
+                }
+            } catch (ParseException e) {
+                System.out.println(e.getMessage()+"\n");
+                FileProfiler.USAGE();
+                return;
+            }
+
+            FSBatchProcessCLI.main(updatedArgs);
+        } finally {
+            if (tmpBCConfig != null && Files.isRegularFile(tmpBCConfig)) {
+                Files.delete(tmpBCConfig);
+            }
+        }
+    }
+
     private void handleStartDB(String[] args) throws SQLException {
         List<String> argList = new ArrayList<>();
         argList.add("-web");
@@ -139,9 +188,9 @@ public class TikaEvalCLI {
         try {
             tmpBCConfig = Files.createTempFile("tika-eval-profiler", ".xml");
             if (! containsBC) {
-                Files.copy(
-                        
this.getClass().getResourceAsStream("/tika-eval-profiler-config.xml"),
-                        tmpBCConfig, StandardCopyOption.REPLACE_EXISTING);
+                try (InputStream is = 
this.getClass().getResourceAsStream("/tika-eval-profiler-config.xml")) {
+                    Files.copy(is, tmpBCConfig, 
StandardCopyOption.REPLACE_EXISTING);
+                }
                 argList.add("-bc");
                 argList.add(tmpBCConfig.toAbsolutePath().toString());
             }
@@ -230,9 +279,9 @@ public class TikaEvalCLI {
         try {
             tmpBCConfig = Files.createTempFile("tika-eval", ".xml");
             if (! containsBC) {
-                Files.copy(
-                        
this.getClass().getResourceAsStream("/tika-eval-comparison-config.xml"),
-                        tmpBCConfig, StandardCopyOption.REPLACE_EXISTING);
+                try (InputStream is = 
this.getClass().getResourceAsStream("/tika-eval-comparison-config.xml")) {
+                    Files.copy(is, tmpBCConfig, 
StandardCopyOption.REPLACE_EXISTING);
+                }
                 argList.add("-bc");
                 argList.add(tmpBCConfig.toAbsolutePath().toString());
 
diff --git a/tika-eval/src/main/java/org/apache/tika/eval/XMLErrorLogUpdater.java b/tika-eval/src/main/java/org/apache/tika/eval/XMLErrorLogUpdater.java
index 18241ef..4260c1a 100644
--- a/tika-eval/src/main/java/org/apache/tika/eval/XMLErrorLogUpdater.java
+++ b/tika-eval/src/main/java/org/apache/tika/eval/XMLErrorLogUpdater.java
@@ -63,7 +63,6 @@ public class XMLErrorLogUpdater {
         writer.update(connection, ExtractComparer.EXTRACT_EXCEPTION_TABLE_A, 
xmlLogFileA);
         writer.update(connection, ExtractComparer.EXTRACT_EXCEPTION_TABLE_B, 
xmlLogFileB);
         connection.commit();
-        connection.close();
     }
 
     public void update(Connection connection, TableInfo tableInfo, Path 
xmlLogFile) throws Exception {
diff --git a/tika-eval/src/main/java/org/apache/tika/eval/batch/EvalConsumerBuilder.java b/tika-eval/src/main/java/org/apache/tika/eval/batch/EvalConsumerBuilder.java
index b50d4a1..6f407f6 100644
--- a/tika-eval/src/main/java/org/apache/tika/eval/batch/EvalConsumerBuilder.java
+++ b/tika-eval/src/main/java/org/apache/tika/eval/batch/EvalConsumerBuilder.java
@@ -104,7 +104,8 @@ public abstract class EvalConsumerBuilder {
 
     public void populateRefTables() throws IOException, SQLException {
         boolean refTablesPopulated = true;
-        try (Connection connection = dbUtil.getConnection()) {
+        try{
+            Connection connection = dbUtil.getConnection();
             for (TableInfo tableInfo : getRefTableInfos()) {
                 int rows = 0;
                 try (ResultSet rs = 
connection.createStatement().executeQuery("select * from "+
diff --git a/tika-eval/src/main/java/org/apache/tika/eval/batch/FileProfilerBuilder.java b/tika-eval/src/main/java/org/apache/tika/eval/batch/FileProfilerBuilder.java
new file mode 100644
index 0000000..0ba7bea
--- /dev/null
+++ b/tika-eval/src/main/java/org/apache/tika/eval/batch/FileProfilerBuilder.java
@@ -0,0 +1,93 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.tika.eval.batch;
+
+import org.apache.tika.batch.FileResourceConsumer;
+import org.apache.tika.eval.AbstractProfiler;
+import org.apache.tika.eval.ExtractProfiler;
+import org.apache.tika.eval.FileProfiler;
+import org.apache.tika.eval.db.TableInfo;
+import org.apache.tika.util.PropsUtil;
+
+import java.io.IOException;
+import java.nio.file.Files;
+import java.nio.file.Path;
+import java.sql.SQLException;
+import java.util.ArrayList;
+import java.util.Collections;
+import java.util.List;
+import java.util.Map;
+
+
+public class FileProfilerBuilder extends EvalConsumerBuilder {
+
+    public final static String TABLE_PREFIX_KEY = "tablePrefix";
+
+    private final List<TableInfo> tableInfos;
+    public FileProfilerBuilder() {
+        List<TableInfo> tableInfos = new ArrayList();
+        tableInfos.add(AbstractProfiler.MIME_TABLE);
+        tableInfos.add(FileProfiler.FILE_PROFILES);
+        this.tableInfos = Collections.unmodifiableList(tableInfos);
+
+    }
+
+    @Override
+    public FileResourceConsumer build() throws IOException, SQLException {
+
+        Path inputDir = PropsUtil.getPath(localAttrs.get("inputDir"), null);
+
+        //we _could_ set this to extracts (if not null)
+        //here, but the Crawler defaults to "input" if nothing is passed
+        //so this won't work
+        if (inputDir == null) {
+            throw new RuntimeException("Must specify -inputDir");
+        }
+        return parameterizeProfiler(new FileProfiler(queue, inputDir,
+                getDBWriter(tableInfos)));
+    }
+
+
+    @Override
+    protected void updateTableInfosWithPrefixes(Map<String, String> attrs) {
+        String tableNamePrefix = attrs.get(TABLE_PREFIX_KEY);
+        if (tableNamePrefix != null && !tableNamePrefix.equals("null")) {
+            for (TableInfo tableInfo : tableInfos) {
+                tableInfo.setNamePrefix(tableNamePrefix);
+            }
+        }
+    }
+
+    @Override
+    protected List<TableInfo> getRefTableInfos() {
+        return Collections.EMPTY_LIST;
+    }
+
+    @Override
+    protected List<TableInfo> getNonRefTableInfos() {
+        return tableInfos;
+    }
+
+    @Override
+    protected void addErrorLogTablePairs(DBConsumersManager manager) {
+        Path errorLog = PropsUtil.getPath(localAttrs.get("errorLogFile"), 
null);
+        if (errorLog == null) {
+            return;
+        }
+        manager.addErrorLogTablePair(errorLog, 
ExtractProfiler.EXTRACT_EXCEPTION_TABLE);
+    }
+}
diff --git a/tika-eval/src/main/java/org/apache/tika/eval/db/Cols.java b/tika-eval/src/main/java/org/apache/tika/eval/db/Cols.java
index f3b212c..f0e0955 100644
--- a/tika-eval/src/main/java/org/apache/tika/eval/db/Cols.java
+++ b/tika-eval/src/main/java/org/apache/tika/eval/db/Cols.java
@@ -37,6 +37,9 @@ public enum Cols {
     IS_EMBEDDED,
     EMBEDDED_FILE_PATH,
     MIME_ID,
+    TIKA_MIME_ID,
+    FILE_MIME_ID,
+    SHA256,
     MD5,
     NUM_ATTACHMENTS,
     HAS_CONTENT,
diff --git a/tika-eval/src/main/java/org/apache/tika/eval/db/JDBCUtil.java b/tika-eval/src/main/java/org/apache/tika/eval/db/JDBCUtil.java
index 5c3e427..33f1279 100644
--- a/tika-eval/src/main/java/org/apache/tika/eval/db/JDBCUtil.java
+++ b/tika-eval/src/main/java/org/apache/tika/eval/db/JDBCUtil.java
@@ -51,6 +51,7 @@ public class JDBCUtil {
 
     private final String connectionString;
     private String driverClass;
+    private Connection connection = null;
 
     public JDBCUtil(String connectionString, String driverClass) {
         this.connectionString = connectionString;
@@ -65,7 +66,7 @@ public class JDBCUtil {
                     Properties properties = new Properties();
                     properties.load(is);
                     for (String k : properties.stringPropertyNames()) {
-                        Matcher m = 
Pattern.compile("(?i)jdbc:"+k).matcher(connectionString);
+                        Matcher m = Pattern.compile("(?i)jdbc:" + 
k).matcher(connectionString);
                         if (m.find()) {
                             this.driverClass = properties.getProperty(k);
                         }
@@ -86,8 +87,10 @@ public class JDBCUtil {
      * @throws IOException
      */
     public Connection getConnection() throws SQLException {
+        if (connection != null) {
+            return connection;
+        }
         String connectionString = getConnectionString();
-        Connection conn = null;
         String jdbcDriver = getJDBCDriverClass();
         if (jdbcDriver != null) {
             try {
@@ -96,14 +99,15 @@ public class JDBCUtil {
                 throw new RuntimeException(e);
             }
         }
-        conn = DriverManager.getConnection(connectionString);
-        conn.setAutoCommit(false);
+        connection = DriverManager.getConnection(connectionString);
+        connection.setAutoCommit(false);
 
-        return conn;
+        return connection;
     }
 
     /**
      * JDBC driver class.  Override as necessary.
+     *
      * @return
      */
     public String getJDBCDriverClass() {
@@ -170,8 +174,8 @@ public class JDBCUtil {
     }
 
     public static void batchInsert(PreparedStatement insertStatement,
-                             TableInfo table,
-                             Map<Cols, String> data) throws SQLException {
+                                   TableInfo table,
+                                   Map<Cols, String> data) throws SQLException 
{
 
         try {
             int i = 1;
@@ -244,34 +248,35 @@ public class JDBCUtil {
 
     public void createTables(List<TableInfo> tableInfos, CREATE_TABLE 
createTable) throws SQLException, IOException {
 
-        try (Connection conn = getConnection ()) {
-            for (TableInfo tableInfo : tableInfos) {
+        Connection conn = getConnection();
+        for (TableInfo tableInfo : tableInfos) {
 
-                if (createTable.equals(CREATE_TABLE.DROP_IF_EXISTS)) {
-                    dropTableIfExists(conn, tableInfo.getName());
-                } else if (createTable.equals(CREATE_TABLE.SKIP_IF_EXISTS)) {
-                    if (containsTable(tableInfo.getName())) {
-                        continue;
-                    }
+            if (createTable.equals(CREATE_TABLE.DROP_IF_EXISTS)) {
+                dropTableIfExists(conn, tableInfo.getName());
+            } else if (createTable.equals(CREATE_TABLE.SKIP_IF_EXISTS)) {
+                if (containsTable(tableInfo.getName())) {
+                    continue;
                 }
-                createTable(conn, tableInfo);
             }
-            conn.commit();
+            createTable(conn, tableInfo);
         }
+        conn.commit();
+
     }
 
     public boolean containsTable(String tableName) throws SQLException {
-        try (Connection connection = getConnection()) {
-            Set<String> tables = getTables(connection);
-            if (tables.contains(normalizeTableName(tableName))) {
-                return true;
-            }
+        Connection connection = getConnection();
+        Set<String> tables = getTables(connection);
+        if (tables.contains(normalizeTableName(tableName))) {
+            return true;
         }
+
         return false;
     }
 
     /**
      * Override for custom behavior
+     *
      * @param tableName
      * @return
      */
diff --git a/tika-eval/src/main/java/org/apache/tika/eval/db/MimeBuffer.java b/tika-eval/src/main/java/org/apache/tika/eval/db/MimeBuffer.java
index 3588622..9f6b136 100644
--- a/tika-eval/src/main/java/org/apache/tika/eval/db/MimeBuffer.java
+++ b/tika-eval/src/main/java/org/apache/tika/eval/db/MimeBuffer.java
@@ -72,7 +72,6 @@ public class MimeBuffer extends AbstractDBBuffer {
     public void close() throws SQLException {
         st.close();
         connection.commit();
-        connection.close();
     }
 
     private static class MimeUtil {
diff --git a/tika-eval/src/main/java/org/apache/tika/eval/io/DBWriter.java b/tika-eval/src/main/java/org/apache/tika/eval/io/DBWriter.java
index 909727a..19ff65b 100644
--- a/tika-eval/src/main/java/org/apache/tika/eval/io/DBWriter.java
+++ b/tika-eval/src/main/java/org/apache/tika/eval/io/DBWriter.java
@@ -135,6 +135,11 @@ public class DBWriter implements IDBWriter {
         }
     }
 
+    /**
+     * This closes the writer by executing batch and
+     * committing changes.  This DOES NOT close the connection.
+     * @throws IOException
+     */
     public void close() throws IOException {
         for (PreparedStatement p : inserts.values()) {
             try {
@@ -148,12 +153,6 @@ public class DBWriter implements IDBWriter {
         } catch (SQLException e){
             throw new IOExceptionWithCause(e);
         }
-        try {
-            conn.close();
-        } catch (SQLException e) {
-            throw new IOExceptionWithCause(e);
-        }
-
     }
 
     private class LastInsert {
diff --git a/tika-eval/src/main/resources/tika-eval-file-profiler-config.xml b/tika-eval/src/main/resources/tika-eval-file-profiler-config.xml
new file mode 100644
index 0000000..6a7867a
--- /dev/null
+++ b/tika-eval/src/main/resources/tika-eval-file-profiler-config.xml
@@ -0,0 +1,74 @@
+<?xml version="1.0" encoding="UTF-8" standalone="no" ?>
+<!--
+  Licensed to the Apache Software Foundation (ASF) under one
+  or more contributor license agreements.  See the NOTICE file
+  distributed with this work for additional information
+  regarding copyright ownership.  The ASF licenses this file
+  to you under the Apache License, Version 2.0 (the
+  "License"); you may not use this file except in compliance
+  with the License.  You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+  Unless required by applicable law or agreed to in writing,
+  software distributed under the License is distributed on an
+  "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+  KIND, either express or implied.  See the License for the
+  specific language governing permissions and limitations
+  under the License.
+-->
+
+<tika-batch-config
+        maxAliveTimeSeconds="-1"
+        pauseOnEarlyTerminationMillis="500"
+        timeoutCheckPulseMillis="1000"
+        maxQueueSize="10000"
+        numConsumers="5"
+        timeoutThresholdMillis="300000">
+
+    <commandline>
+        <option opt="bc" longOpt="batch-config" hasArg="true"
+                description="xml batch config file" required="true"/>
+        <option opt="inputDir" hasArg="true"
+                description="dir to start crawling"/>
+        <option opt="numConsumers" hasArg="true"
+                description="number of fileConsumers threads"/>
+        <option opt="extracts" hasArg="true"
+                description="this dir for analysis" required="false"/>
+        <option opt="db" hasArg="true"
+                description="name of db directory or file to which to write 
results"/>
+        <option opt="jdbc" hasArg="true"
+                description="full jdbc connection string"/>
+        <option opt="jdbcDriver" hasArg="true"
+                description="canonical class name for jdbc driver"/>
+        <option opt="tablePrefix" hasArg="true"
+                description="EXPERT: prefix for table names"/>
+        <option opt="drop" hasArg="false" description="drop tables if they 
exist"/>
+        <option opt="maxFilesToAdd" hasArg="true" description="maximum number 
of files to add to the crawler"/>
+
+    </commandline>
+
+
+    <!--
+        Can also add startDir: this tells the crawler to start indexing a
+        child directory of the inputDir directory.
+    -->
+    <crawler builderClass="org.apache.tika.batch.fs.builders.FSCrawlerBuilder"
+
+             crawlOrder="sorted"
+             maxConsecWaitMillis="5000"
+             maxFilesToAdd="-1"
+             maxFilesToConsider="-1"
+             includeFilePat=""
+             excludeFilePat=""
+             maxFileSizeBytes="-1"
+    />
+
+    <consumers builderClass="org.apache.tika.eval.batch.EvalConsumersBuilder"
+               
consumerBuilderClass="org.apache.tika.eval.batch.FileProfilerBuilder"/>
+
+
+    <!-- reporter and interrupter are optional -->
+    <reporter 
builderClass="org.apache.tika.batch.builders.SimpleLogReporterBuilder" 
sleepMillis="1000"
+              staleThresholdMillis="500000"/>
+</tika-batch-config>

Reply via email to