This is an automated email from the ASF dual-hosted git repository.
tallison pushed a commit to branch main
in repository https://gitbox.apache.org/repos/asf/tika.git
The following commit(s) were added to refs/heads/main by this push:
new f0df1f5 TIKA-3216 -- Add FileProfiler
f0df1f5 is described below
commit f0df1f50a801a1816e1cc2fc1cf7a686a86f4dbb
Author: tallison <[email protected]>
AuthorDate: Fri Oct 30 10:48:09 2020 -0400
TIKA-3216 -- Add FileProfiler
---
.../apache/tika/detect/FileCommandDetector.java | 4 +-
.../java/org/apache/tika/eval/ExtractComparer.java | 2 +-
.../java/org/apache/tika/eval/ExtractProfiler.java | 2 +-
.../java/org/apache/tika/eval/FileProfiler.java | 158 +++++++++++++++++++++
.../java/org/apache/tika/eval/TikaEvalCLI.java | 50 +++++++
.../org/apache/tika/eval/XMLErrorLogUpdater.java | 1 -
.../tika/eval/batch/EvalConsumerBuilder.java | 3 +-
.../tika/eval/batch/FileProfilerBuilder.java | 93 ++++++++++++
.../main/java/org/apache/tika/eval/db/Cols.java | 3 +
.../java/org/apache/tika/eval/db/JDBCUtil.java | 49 ++++---
.../java/org/apache/tika/eval/db/MimeBuffer.java | 1 -
.../java/org/apache/tika/eval/io/DBWriter.java | 11 +-
.../resources/tika-eval-file-profiler-config.xml | 74 ++++++++++
13 files changed, 416 insertions(+), 35 deletions(-)
diff --git
a/tika-core/src/main/java/org/apache/tika/detect/FileCommandDetector.java
b/tika-core/src/main/java/org/apache/tika/detect/FileCommandDetector.java
index b68991d..fd851c7 100644
--- a/tika-core/src/main/java/org/apache/tika/detect/FileCommandDetector.java
+++ b/tika-core/src/main/java/org/apache/tika/detect/FileCommandDetector.java
@@ -66,12 +66,12 @@ public class FileCommandDetector implements Detector {
private int maxBytes = 1_000_000;
private long timeoutMs = DEFAULT_TIMEOUT_MS;
- static boolean checkHasFile() {
+ public static boolean checkHasFile() {
return checkHasFile(DEFAULT_FILE_COMMAND_PATH);
}
- static boolean checkHasFile(String fileCommandPath) {
+ public static boolean checkHasFile(String fileCommandPath) {
String[] commandline = new String[]{
fileCommandPath, "-v"
};
diff --git a/tika-eval/src/main/java/org/apache/tika/eval/ExtractComparer.java
b/tika-eval/src/main/java/org/apache/tika/eval/ExtractComparer.java
index 8eca1c9..79df621 100644
--- a/tika-eval/src/main/java/org/apache/tika/eval/ExtractComparer.java
+++ b/tika-eval/src/main/java/org/apache/tika/eval/ExtractComparer.java
@@ -82,7 +82,7 @@ public class ExtractComparer extends AbstractProfiler {
.addOption("jdbcDriver", true, "EXPERT: jdbc driver, or
specify via -Djdbc.driver")
.addOption("tablePrefixA", true, "EXPERT: optional prefix for
table names for A")
.addOption("tablePrefixB", true, "EXPERT: optional prefix for
table names for B")
- .addOption("drop", true, "drop tables if they exist")
+ .addOption("drop", false, "drop tables if they exist")
.addOption("maxFilesToAdd", true, "maximum number of files to
add to the crawler")
.addOption("maxTokens", true, "maximum tokens to process,
default=200000")
.addOption("maxContentLength", true, "truncate content beyond
this length for calculating 'contents' stats, default=1000000")
diff --git a/tika-eval/src/main/java/org/apache/tika/eval/ExtractProfiler.java
b/tika-eval/src/main/java/org/apache/tika/eval/ExtractProfiler.java
index e6e4c14..5fad576 100644
--- a/tika-eval/src/main/java/org/apache/tika/eval/ExtractProfiler.java
+++ b/tika-eval/src/main/java/org/apache/tika/eval/ExtractProfiler.java
@@ -66,7 +66,7 @@ public class ExtractProfiler extends AbstractProfiler {
.addOption("jdbc", true, "EXPERT: full jdbc connection string.
Must specify this or -db <h2db>")
.addOption("jdbcDriver", true, "EXPERT: jdbc driver, or
specify via -Djdbc.driver")
.addOption("tablePrefix", true, "EXPERT: optional prefix for
table names")
- .addOption("drop", true, "drop tables if they exist")
+ .addOption("drop", false, "drop tables if they exist")
.addOption("maxFilesToAdd", true, "maximum number of files to
add to the crawler")
.addOption("maxTokens", true, "maximum tokens to process,
default=200000")
.addOption("maxContentLength", true, "truncate content beyond
this length for calculating 'contents' stats, default=1000000")
diff --git a/tika-eval/src/main/java/org/apache/tika/eval/FileProfiler.java
b/tika-eval/src/main/java/org/apache/tika/eval/FileProfiler.java
new file mode 100644
index 0000000..dcd1751
--- /dev/null
+++ b/tika-eval/src/main/java/org/apache/tika/eval/FileProfiler.java
@@ -0,0 +1,158 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.tika.eval;
+
+import org.apache.commons.cli.HelpFormatter;
+import org.apache.commons.cli.Option;
+import org.apache.commons.cli.Options;
+import org.apache.commons.codec.digest.DigestUtils;
+import org.apache.tika.Tika;
+import org.apache.tika.batch.FileResource;
+import org.apache.tika.batch.fs.FSProperties;
+import org.apache.tika.detect.FileCommandDetector;
+import org.apache.tika.eval.db.ColInfo;
+import org.apache.tika.eval.db.Cols;
+import org.apache.tika.eval.db.TableInfo;
+import org.apache.tika.eval.io.IDBWriter;
+import org.apache.tika.io.TikaInputStream;
+import org.apache.tika.metadata.Metadata;
+
+import java.io.IOException;
+import java.io.InputStream;
+import java.nio.file.Files;
+import java.nio.file.Path;
+import java.sql.Types;
+import java.util.HashMap;
+import java.util.Map;
+import java.util.concurrent.ArrayBlockingQueue;
+
+/**
+ * This class profiles actual files as opposed to extracts e.g. {@link
ExtractProfiler}.
+ * This does _not_ parse files, but does run file type identification and
digests the
+ * raw bytes.
+ *
+ * If the 'file' command is available on the command line, this will also run
the
+ * FileCommandDetector.
+ */
+
+public class FileProfiler extends AbstractProfiler {
+//TODO: we should allow users to select digest type/encoding and file
detector(s).
+
+ private static final boolean HAS_FILE = FileCommandDetector.checkHasFile();
+ static Options OPTIONS;
+ static {
+
+ Option inputDir = new Option("inputDir", true,
+ "optional: directory for original binary input documents."+
+ " If not specified, -extracts is crawled as is.");
+
+ OPTIONS = new Options()
+ .addOption(inputDir)
+ .addOption("bc", "optional: tika-batch config file")
+ .addOption("numConsumers", true, "optional: number of consumer
threads")
+ .addOption("db", true, "db file to which to write results")
+ .addOption("jdbc", true, "EXPERT: full jdbc connection string.
Must specify this or -db <h2db>")
+ .addOption("jdbcDriver", true, "EXPERT: jdbc driver, or
specify via -Djdbc.driver")
+ .addOption("tablePrefix", true, "EXPERT: optional prefix for
table names")
+ .addOption("drop", false, "drop tables if they exist")
+ .addOption("maxFilesToAdd", true, "maximum number of files to
add to the crawler")
+
+ ;
+
+ }
+
+ public static void USAGE() {
+ HelpFormatter helpFormatter = new HelpFormatter();
+ helpFormatter.printHelp(
+ 80,
+ "java -jar tika-eval-x.y.jar FileProfiler -inputDir docs -db
mydb [-inputDir input]",
+ "Tool: Profile",
+ FileProfiler.OPTIONS,
+ "Note: for the default h2 db, do not include the .mv.db at the
end of the db name.");
+ }
+
+
+
+ public static TableInfo FILE_PROFILES = HAS_FILE ?
+ new TableInfo("file_profiles",
+ new ColInfo(Cols.FILE_PATH, Types.VARCHAR, 2048, "PRIMARY
KEY"),
+ new ColInfo(Cols.LENGTH, Types.BIGINT),
+ new ColInfo(Cols.SHA256, Types.VARCHAR, 64),
+ new ColInfo(Cols.TIKA_MIME_ID, Types.INTEGER),
+ new ColInfo(Cols.FILE_MIME_ID, Types.INTEGER))
+ :
+ new TableInfo("file_profiles",
+ new ColInfo(Cols.FILE_PATH, Types.VARCHAR, 2048, "PRIMARY
KEY"),
+ new ColInfo(Cols.LENGTH, Types.BIGINT),
+ new ColInfo(Cols.SHA256, Types.VARCHAR, 64),
+ new ColInfo(Cols.TIKA_MIME_ID, Types.INTEGER));
+
+
+
+ public static final String DETECT_EXCEPTION = "detect-exception";
+ private static final Tika TIKA = new Tika();
+
+ private static final FileCommandDetector FILE_COMMAND_DETECTOR = new
FileCommandDetector();
+ private final Path inputDir;
+
+ public FileProfiler(ArrayBlockingQueue<FileResource> fileQueue, Path
inputDir, IDBWriter dbWriter) {
+ super(fileQueue, dbWriter);
+ this.inputDir = inputDir;
+ }
+
+
+ @Override
+ public boolean processFileResource(FileResource fileResource) {
+ String relPath =
fileResource.getMetadata().get(FSProperties.FS_REL_PATH);
+ try (InputStream is = fileResource.openInputStream()) {
+ try (TikaInputStream tis = TikaInputStream.get(is)) {
+ Path path = tis.getPath();
+ Map<Cols, String> data = new HashMap<>();
+ int tikaMimeId = writer.getMimeId(detectTika(tis));
+ data.put(Cols.FILE_PATH, relPath);
+ data.put(Cols.TIKA_MIME_ID, Integer.toString(tikaMimeId));
+ data.put(Cols.LENGTH, Long.toString(Files.size(path)));
+ data.put(Cols.SHA256, DigestUtils.sha256Hex(tis));
+ if (HAS_FILE) {
+ int fileMimeId = writer.getMimeId(detectFile(tis));
+ data.put(Cols.FILE_MIME_ID, Integer.toString(fileMimeId));
+ }
+ writer.writeRow(FILE_PROFILES, data);
+ }
+ } catch (IOException e) {
+ //log at least!
+ return false;
+ }
+ return true;
+ }
+
+ private String detectFile(TikaInputStream tis) {
+ try {
+ return FILE_COMMAND_DETECTOR.detect(tis, new
Metadata()).toString();
+ } catch (IOException e) {
+ return DETECT_EXCEPTION;
+ }
+ }
+
+ private String detectTika(TikaInputStream tis) {
+ try {
+ return TIKA.detect(tis);
+ } catch (IOException e) {
+ return DETECT_EXCEPTION;
+ }
+ }
+}
diff --git a/tika-eval/src/main/java/org/apache/tika/eval/TikaEvalCLI.java
b/tika-eval/src/main/java/org/apache/tika/eval/TikaEvalCLI.java
index 4baa3b0..7692b57 100644
--- a/tika-eval/src/main/java/org/apache/tika/eval/TikaEvalCLI.java
+++ b/tika-eval/src/main/java/org/apache/tika/eval/TikaEvalCLI.java
@@ -16,6 +16,7 @@
*/
package org.apache.tika.eval;
+import java.io.IOException;
import java.io.InputStream;
import java.nio.file.Files;
import java.nio.file.Path;
@@ -57,11 +58,60 @@ public class TikaEvalCLI {
handleProfile(subsetArgs);
} else if (tool.equals("StartDB")) {
handleStartDB(subsetArgs);
+ } else if (tool.equals("FileProfile")){
+ handleProfileFiles(subsetArgs);
} else {
System.out.println(specifyTools());
}
}
+    /**
+     * Runs the FileProfile tool: validates the arguments against
+     * {@link FileProfiler#OPTIONS} and hands them to {@link FSBatchProcessCLI}.
+     * If no -bc argument is given, the bundled
+     * tika-eval-file-profiler-config.xml is copied to a temp file, passed
+     * as -bc, and deleted again when this method returns.
+     *
+     * @param subsetArgs commandline arguments that follow the tool name
+     * @throws Exception if the default config cannot be copied or the batch run fails
+     */
+    private void handleProfileFiles(String[] subsetArgs) throws Exception {
+        List<String> argList = new ArrayList<>(Arrays.asList(subsetArgs));
+
+        Path tmpBCConfig = null;
+        try {
+            //only create (and later delete) the temp config if the user did not supply one
+            if (!argList.contains("-bc")) {
+                tmpBCConfig = Files.createTempFile("tika-eval-profiler", ".xml");
+                try (InputStream is =
+                             this.getClass().getResourceAsStream("/tika-eval-file-profiler-config.xml")) {
+                    if (is == null) {
+                        //fail with a clear message instead of a bare NPE from Files.copy
+                        throw new IllegalStateException(
+                                "Couldn't find /tika-eval-file-profiler-config.xml on the classpath");
+                    }
+                    Files.copy(is, tmpBCConfig, StandardCopyOption.REPLACE_EXISTING);
+                }
+                argList.add("-bc");
+                argList.add(tmpBCConfig.toAbsolutePath().toString());
+            }
+
+            String[] updatedArgs = argList.toArray(new String[0]);
+            DefaultParser defaultCLIParser = new DefaultParser();
+            try {
+                CommandLine commandLine = defaultCLIParser.parse(FileProfiler.OPTIONS, updatedArgs);
+                if (commandLine.hasOption("db") && commandLine.hasOption("jdbc")) {
+                    System.out.println("Please specify either the default -db or the full -jdbc, not both");
+                    //print this tool's usage, not the ExtractProfiler's
+                    FileProfiler.USAGE();
+                    return;
+                }
+            } catch (ParseException e) {
+                System.out.println(e.getMessage()+"\n");
+                FileProfiler.USAGE();
+                return;
+            }
+
+            FSBatchProcessCLI.main(updatedArgs);
+        } finally {
+            if (tmpBCConfig != null) {
+                Files.deleteIfExists(tmpBCConfig);
+            }
+        }
+    }
+
private void handleStartDB(String[] args) throws SQLException {
List<String> argList = new ArrayList<>();
argList.add("-web");
diff --git
a/tika-eval/src/main/java/org/apache/tika/eval/XMLErrorLogUpdater.java
b/tika-eval/src/main/java/org/apache/tika/eval/XMLErrorLogUpdater.java
index 18241ef..4260c1a 100644
--- a/tika-eval/src/main/java/org/apache/tika/eval/XMLErrorLogUpdater.java
+++ b/tika-eval/src/main/java/org/apache/tika/eval/XMLErrorLogUpdater.java
@@ -63,7 +63,6 @@ public class XMLErrorLogUpdater {
writer.update(connection, ExtractComparer.EXTRACT_EXCEPTION_TABLE_A,
xmlLogFileA);
writer.update(connection, ExtractComparer.EXTRACT_EXCEPTION_TABLE_B,
xmlLogFileB);
connection.commit();
- connection.close();
}
public void update(Connection connection, TableInfo tableInfo, Path
xmlLogFile) throws Exception {
diff --git
a/tika-eval/src/main/java/org/apache/tika/eval/batch/EvalConsumerBuilder.java
b/tika-eval/src/main/java/org/apache/tika/eval/batch/EvalConsumerBuilder.java
index b50d4a1..6f407f6 100644
---
a/tika-eval/src/main/java/org/apache/tika/eval/batch/EvalConsumerBuilder.java
+++
b/tika-eval/src/main/java/org/apache/tika/eval/batch/EvalConsumerBuilder.java
@@ -104,7 +104,8 @@ public abstract class EvalConsumerBuilder {
public void populateRefTables() throws IOException, SQLException {
boolean refTablesPopulated = true;
- try (Connection connection = dbUtil.getConnection()) {
+ try{
+ Connection connection = dbUtil.getConnection();
for (TableInfo tableInfo : getRefTableInfos()) {
int rows = 0;
try (ResultSet rs =
connection.createStatement().executeQuery("select * from "+
diff --git
a/tika-eval/src/main/java/org/apache/tika/eval/batch/FileProfilerBuilder.java
b/tika-eval/src/main/java/org/apache/tika/eval/batch/FileProfilerBuilder.java
new file mode 100644
index 0000000..0ba7bea
--- /dev/null
+++
b/tika-eval/src/main/java/org/apache/tika/eval/batch/FileProfilerBuilder.java
@@ -0,0 +1,93 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.tika.eval.batch;
+
+import org.apache.tika.batch.FileResourceConsumer;
+import org.apache.tika.eval.AbstractProfiler;
+import org.apache.tika.eval.ExtractProfiler;
+import org.apache.tika.eval.FileProfiler;
+import org.apache.tika.eval.db.TableInfo;
+import org.apache.tika.util.PropsUtil;
+
+import java.io.IOException;
+import java.nio.file.Files;
+import java.nio.file.Path;
+import java.sql.SQLException;
+import java.util.ArrayList;
+import java.util.Collections;
+import java.util.List;
+import java.util.Map;
+
+
+/**
+ * Builds {@link FileProfiler} consumers and declares the tables they write to:
+ * {@link AbstractProfiler#MIME_TABLE} and {@link FileProfiler#FILE_PROFILES}.
+ */
+public class FileProfilerBuilder extends EvalConsumerBuilder {
+
+    /** Attribute key for the optional table name prefix. */
+    public static final String TABLE_PREFIX_KEY = "tablePrefix";
+
+    private final List<TableInfo> tableInfos;
+
+    public FileProfilerBuilder() {
+        List<TableInfo> tableInfos = new ArrayList<>();
+        tableInfos.add(AbstractProfiler.MIME_TABLE);
+        tableInfos.add(FileProfiler.FILE_PROFILES);
+        this.tableInfos = Collections.unmodifiableList(tableInfos);
+    }
+
+    /**
+     * Creates a {@link FileProfiler} that writes to this builder's tables.
+     *
+     * @return the parameterized profiler
+     * @throws RuntimeException if no inputDir attribute was supplied
+     */
+    @Override
+    public FileResourceConsumer build() throws IOException, SQLException {
+
+        Path inputDir = PropsUtil.getPath(localAttrs.get("inputDir"), null);
+
+        //we _could_ set this to extracts (if not null)
+        //here, but the Crawler defaults to "input" if nothing is passed
+        //so this won't work
+        if (inputDir == null) {
+            throw new RuntimeException("Must specify -inputDir");
+        }
+        return parameterizeProfiler(new FileProfiler(queue, inputDir,
+                getDBWriter(tableInfos)));
+    }
+
+    /**
+     * Applies the optional table name prefix to every table of this builder.
+     * A missing prefix or the literal string "null" leaves the names untouched.
+     * Note that the TableInfo objects are the shared static instances, so the
+     * prefix is set on those instances themselves.
+     */
+    @Override
+    protected void updateTableInfosWithPrefixes(Map<String, String> attrs) {
+        String tableNamePrefix = attrs.get(TABLE_PREFIX_KEY);
+        if (tableNamePrefix != null && !tableNamePrefix.equals("null")) {
+            for (TableInfo tableInfo : tableInfos) {
+                tableInfo.setNamePrefix(tableNamePrefix);
+            }
+        }
+    }
+
+    /** @return an empty list; this builder declares no reference tables */
+    @Override
+    protected List<TableInfo> getRefTableInfos() {
+        return Collections.emptyList();
+    }
+
+    @Override
+    protected List<TableInfo> getNonRefTableInfos() {
+        return tableInfos;
+    }
+
+    /**
+     * Registers the error log, if an errorLogFile attribute was supplied.
+     * NOTE(review): the table paired with the log is
+     * ExtractProfiler.EXTRACT_EXCEPTION_TABLE, which is not among the tables
+     * declared by this builder -- confirm that it exists when this runs.
+     */
+    @Override
+    protected void addErrorLogTablePairs(DBConsumersManager manager) {
+        Path errorLog = PropsUtil.getPath(localAttrs.get("errorLogFile"), null);
+        if (errorLog == null) {
+            return;
+        }
+        manager.addErrorLogTablePair(errorLog, ExtractProfiler.EXTRACT_EXCEPTION_TABLE);
+    }
+}
diff --git a/tika-eval/src/main/java/org/apache/tika/eval/db/Cols.java
b/tika-eval/src/main/java/org/apache/tika/eval/db/Cols.java
index f3b212c..f0e0955 100644
--- a/tika-eval/src/main/java/org/apache/tika/eval/db/Cols.java
+++ b/tika-eval/src/main/java/org/apache/tika/eval/db/Cols.java
@@ -37,6 +37,9 @@ public enum Cols {
IS_EMBEDDED,
EMBEDDED_FILE_PATH,
MIME_ID,
+ TIKA_MIME_ID,
+ FILE_MIME_ID,
+ SHA256,
MD5,
NUM_ATTACHMENTS,
HAS_CONTENT,
diff --git a/tika-eval/src/main/java/org/apache/tika/eval/db/JDBCUtil.java
b/tika-eval/src/main/java/org/apache/tika/eval/db/JDBCUtil.java
index 5c3e427..33f1279 100644
--- a/tika-eval/src/main/java/org/apache/tika/eval/db/JDBCUtil.java
+++ b/tika-eval/src/main/java/org/apache/tika/eval/db/JDBCUtil.java
@@ -51,6 +51,7 @@ public class JDBCUtil {
private final String connectionString;
private String driverClass;
+ private Connection connection = null;
public JDBCUtil(String connectionString, String driverClass) {
this.connectionString = connectionString;
@@ -65,7 +66,7 @@ public class JDBCUtil {
Properties properties = new Properties();
properties.load(is);
for (String k : properties.stringPropertyNames()) {
- Matcher m =
Pattern.compile("(?i)jdbc:"+k).matcher(connectionString);
+ Matcher m = Pattern.compile("(?i)jdbc:" +
k).matcher(connectionString);
if (m.find()) {
this.driverClass = properties.getProperty(k);
}
@@ -86,8 +87,10 @@ public class JDBCUtil {
* @throws IOException
*/
public Connection getConnection() throws SQLException {
+ if (connection != null) {
+ return connection;
+ }
String connectionString = getConnectionString();
- Connection conn = null;
String jdbcDriver = getJDBCDriverClass();
if (jdbcDriver != null) {
try {
@@ -96,14 +99,15 @@ public class JDBCUtil {
throw new RuntimeException(e);
}
}
- conn = DriverManager.getConnection(connectionString);
- conn.setAutoCommit(false);
+ connection = DriverManager.getConnection(connectionString);
+ connection.setAutoCommit(false);
- return conn;
+ return connection;
}
/**
* JDBC driver class. Override as necessary.
+ *
* @return
*/
public String getJDBCDriverClass() {
@@ -170,8 +174,8 @@ public class JDBCUtil {
}
public static void batchInsert(PreparedStatement insertStatement,
- TableInfo table,
- Map<Cols, String> data) throws SQLException {
+ TableInfo table,
+ Map<Cols, String> data) throws SQLException
{
try {
int i = 1;
@@ -244,34 +248,35 @@ public class JDBCUtil {
public void createTables(List<TableInfo> tableInfos, CREATE_TABLE
createTable) throws SQLException, IOException {
- try (Connection conn = getConnection ()) {
- for (TableInfo tableInfo : tableInfos) {
+ Connection conn = getConnection();
+ for (TableInfo tableInfo : tableInfos) {
- if (createTable.equals(CREATE_TABLE.DROP_IF_EXISTS)) {
- dropTableIfExists(conn, tableInfo.getName());
- } else if (createTable.equals(CREATE_TABLE.SKIP_IF_EXISTS)) {
- if (containsTable(tableInfo.getName())) {
- continue;
- }
+ if (createTable.equals(CREATE_TABLE.DROP_IF_EXISTS)) {
+ dropTableIfExists(conn, tableInfo.getName());
+ } else if (createTable.equals(CREATE_TABLE.SKIP_IF_EXISTS)) {
+ if (containsTable(tableInfo.getName())) {
+ continue;
}
- createTable(conn, tableInfo);
}
- conn.commit();
+ createTable(conn, tableInfo);
}
+ conn.commit();
+
}
public boolean containsTable(String tableName) throws SQLException {
- try (Connection connection = getConnection()) {
- Set<String> tables = getTables(connection);
- if (tables.contains(normalizeTableName(tableName))) {
- return true;
- }
+ Connection connection = getConnection();
+ Set<String> tables = getTables(connection);
+ if (tables.contains(normalizeTableName(tableName))) {
+ return true;
}
+
return false;
}
/**
* Override for custom behavior
+ *
* @param tableName
* @return
*/
diff --git a/tika-eval/src/main/java/org/apache/tika/eval/db/MimeBuffer.java
b/tika-eval/src/main/java/org/apache/tika/eval/db/MimeBuffer.java
index 3588622..9f6b136 100644
--- a/tika-eval/src/main/java/org/apache/tika/eval/db/MimeBuffer.java
+++ b/tika-eval/src/main/java/org/apache/tika/eval/db/MimeBuffer.java
@@ -72,7 +72,6 @@ public class MimeBuffer extends AbstractDBBuffer {
public void close() throws SQLException {
st.close();
connection.commit();
- connection.close();
}
private static class MimeUtil {
diff --git a/tika-eval/src/main/java/org/apache/tika/eval/io/DBWriter.java
b/tika-eval/src/main/java/org/apache/tika/eval/io/DBWriter.java
index 909727a..19ff65b 100644
--- a/tika-eval/src/main/java/org/apache/tika/eval/io/DBWriter.java
+++ b/tika-eval/src/main/java/org/apache/tika/eval/io/DBWriter.java
@@ -135,6 +135,11 @@ public class DBWriter implements IDBWriter {
}
}
+ /**
+ * This closes the writer by executing batch and
+ * committing changes. This DOES NOT close the connection
+ * @throws IOException
+ */
public void close() throws IOException {
for (PreparedStatement p : inserts.values()) {
try {
@@ -148,12 +153,6 @@ public class DBWriter implements IDBWriter {
} catch (SQLException e){
throw new IOExceptionWithCause(e);
}
- try {
- conn.close();
- } catch (SQLException e) {
- throw new IOExceptionWithCause(e);
- }
-
}
private class LastInsert {
diff --git a/tika-eval/src/main/resources/tika-eval-file-profiler-config.xml
b/tika-eval/src/main/resources/tika-eval-file-profiler-config.xml
new file mode 100644
index 0000000..6a7867a
--- /dev/null
+++ b/tika-eval/src/main/resources/tika-eval-file-profiler-config.xml
@@ -0,0 +1,74 @@
+<?xml version="1.0" encoding="UTF-8" standalone="no" ?>
+<!--
+ Licensed to the Apache Software Foundation (ASF) under one
+ or more contributor license agreements. See the NOTICE file
+ distributed with this work for additional information
+ regarding copyright ownership. The ASF licenses this file
+ to you under the Apache License, Version 2.0 (the
+ "License"); you may not use this file except in compliance
+ with the License. You may obtain a copy of the License at
+
+ http://www.apache.org/licenses/LICENSE-2.0
+
+ Unless required by applicable law or agreed to in writing,
+ software distributed under the License is distributed on an
+ "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ KIND, either express or implied. See the License for the
+ specific language governing permissions and limitations
+ under the License.
+-->
+
+<!--
+ Batch configuration for the tika-eval file profiler. TikaEvalCLI copies this
+ resource to a temp file and passes it as -bc when no -bc argument is given.
+-->
+<tika-batch-config
+ maxAliveTimeSeconds="-1"
+ pauseOnEarlyTerminationMillis="500"
+ timeoutCheckPulseMillis="1000"
+ maxQueueSize="10000"
+ numConsumers="5"
+ timeoutThresholdMillis="300000">
+
+ <!--
+ Commandline option declarations.
+ NOTE(review): "extracts" is declared here, but FileProfiler.OPTIONS does
+ not define it; confirm whether this tool still needs it.
+ -->
+ <commandline>
+ <option opt="bc" longOpt="batch-config" hasArg="true"
+ description="xml batch config file" required="true"/>
+ <option opt="inputDir" hasArg="true"
+ description="dir to start crawling"/>
+ <option opt="numConsumers" hasArg="true"
+ description="number of fileConsumers threads"/>
+ <option opt="extracts" hasArg="true"
+ description="this dir for analysis" required="false"/>
+ <option opt="db" hasArg="true"
+ description="name of db directory or file to which to write
results"/>
+ <option opt="jdbc" hasArg="true"
+ description="full jdbc connection string"/>
+ <option opt="jdbcDriver" hasArg="true"
+ description="canonical class name for jdbc driver"/>
+ <option opt="tablePrefix" hasArg="true"
+ description="EXPERT: prefix for table names"/>
+ <option opt="drop" hasArg="false" description="drop tables if they
exist"/>
+ <option opt="maxFilesToAdd" hasArg="true" description="maximum number
of files to add to the crawler"/>
+
+ </commandline>
+
+
+ <!--
+ Can also add startDir: this tells the crawler to start indexing a
+ child directory of the inputDir directory.
+ -->
+ <crawler builderClass="org.apache.tika.batch.fs.builders.FSCrawlerBuilder"
+
+ crawlOrder="sorted"
+ maxConsecWaitMillis="5000"
+ maxFilesToAdd="-1"
+ maxFilesToConsider="-1"
+ includeFilePat=""
+ excludeFilePat=""
+ maxFileSizeBytes="-1"
+ />
+
+ <!-- consumerBuilderClass names the builder for the consumers: FileProfilerBuilder -->
+ <consumers builderClass="org.apache.tika.eval.batch.EvalConsumersBuilder"
+
consumerBuilderClass="org.apache.tika.eval.batch.FileProfilerBuilder"/>
+
+
+ <!-- reporter and interrupter are optional -->
+ <reporter
builderClass="org.apache.tika.batch.builders.SimpleLogReporterBuilder"
sleepMillis="1000"
+ staleThresholdMillis="500000"/>
+</tika-batch-config>