This is an automated email from the ASF dual-hosted git repository. tallison pushed a commit to branch branch_1x in repository https://gitbox.apache.org/repos/asf/tika.git
commit 65c318358db36e64afa477d2b90713f77ec73c4c Author: tallison <[email protected]> AuthorDate: Fri Oct 30 10:48:09 2020 -0400 TIKA-3216 -- Add FileProfiler # Conflicts: # tika-eval/src/main/java/org/apache/tika/eval/TikaEvalCLI.java --- .../apache/tika/detect/FileCommandDetector.java | 4 +- .../java/org/apache/tika/eval/ExtractComparer.java | 2 +- .../java/org/apache/tika/eval/ExtractProfiler.java | 2 +- .../java/org/apache/tika/eval/FileProfiler.java | 158 +++++++++++++++++++++ .../java/org/apache/tika/eval/TikaEvalCLI.java | 61 +++++++- .../org/apache/tika/eval/XMLErrorLogUpdater.java | 1 - .../tika/eval/batch/EvalConsumerBuilder.java | 3 +- .../tika/eval/batch/FileProfilerBuilder.java | 93 ++++++++++++ .../main/java/org/apache/tika/eval/db/Cols.java | 3 + .../java/org/apache/tika/eval/db/JDBCUtil.java | 49 ++++--- .../java/org/apache/tika/eval/db/MimeBuffer.java | 1 - .../java/org/apache/tika/eval/io/DBWriter.java | 11 +- .../resources/tika-eval-file-profiler-config.xml | 74 ++++++++++ 13 files changed, 421 insertions(+), 41 deletions(-) diff --git a/tika-core/src/main/java/org/apache/tika/detect/FileCommandDetector.java b/tika-core/src/main/java/org/apache/tika/detect/FileCommandDetector.java index b68991d..fd851c7 100644 --- a/tika-core/src/main/java/org/apache/tika/detect/FileCommandDetector.java +++ b/tika-core/src/main/java/org/apache/tika/detect/FileCommandDetector.java @@ -66,12 +66,12 @@ public class FileCommandDetector implements Detector { private int maxBytes = 1_000_000; private long timeoutMs = DEFAULT_TIMEOUT_MS; - static boolean checkHasFile() { + public static boolean checkHasFile() { return checkHasFile(DEFAULT_FILE_COMMAND_PATH); } - static boolean checkHasFile(String fileCommandPath) { + public static boolean checkHasFile(String fileCommandPath) { String[] commandline = new String[]{ fileCommandPath, "-v" }; diff --git a/tika-eval/src/main/java/org/apache/tika/eval/ExtractComparer.java b/tika-eval/src/main/java/org/apache/tika/eval/ExtractComparer.java index 91568db..454a92e 100644 --- a/tika-eval/src/main/java/org/apache/tika/eval/ExtractComparer.java +++ b/tika-eval/src/main/java/org/apache/tika/eval/ExtractComparer.java @@ -82,7 +82,7 @@ public class ExtractComparer extends AbstractProfiler { .addOption("jdbcDriver", true, "EXPERT: jdbc driver, or specify via -Djdbc.driver") .addOption("tablePrefixA", true, "EXPERT: optional prefix for table names for A") .addOption("tablePrefixB", true, "EXPERT: optional prefix for table names for B") - .addOption("drop", true, "drop tables if they exist") + .addOption("drop", false, "drop tables if they exist") .addOption("maxFilesToAdd", true, "maximum number of files to add to the crawler") .addOption("maxTokens", true, "maximum tokens to process, default=200000") .addOption("maxContentLength", true, "truncate content beyond this length for calculating 'contents' stats, default=1000000") diff --git a/tika-eval/src/main/java/org/apache/tika/eval/ExtractProfiler.java b/tika-eval/src/main/java/org/apache/tika/eval/ExtractProfiler.java index e6e4c14..5fad576 100644 --- a/tika-eval/src/main/java/org/apache/tika/eval/ExtractProfiler.java +++ b/tika-eval/src/main/java/org/apache/tika/eval/ExtractProfiler.java @@ -66,7 +66,7 @@ public class ExtractProfiler extends AbstractProfiler { .addOption("jdbc", true, "EXPERT: full jdbc connection string. Must specify this or -db <h2db>") .addOption("jdbcDriver", true, "EXPERT: jdbc driver, or specify via -Djdbc.driver") .addOption("tablePrefix", true, "EXPERT: optional prefix for table names") - .addOption("drop", true, "drop tables if they exist") + .addOption("drop", false, "drop tables if they exist") .addOption("maxFilesToAdd", true, "maximum number of files to add to the crawler") .addOption("maxTokens", true, "maximum tokens to process, default=200000") .addOption("maxContentLength", true, "truncate content beyond this length for calculating 'contents' stats, default=1000000") diff --git a/tika-eval/src/main/java/org/apache/tika/eval/FileProfiler.java b/tika-eval/src/main/java/org/apache/tika/eval/FileProfiler.java new file mode 100644 index 0000000..dcd1751 --- /dev/null +++ b/tika-eval/src/main/java/org/apache/tika/eval/FileProfiler.java @@ -0,0 +1,158 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.apache.tika.eval; + +import org.apache.commons.cli.HelpFormatter; +import org.apache.commons.cli.Option; +import org.apache.commons.cli.Options; +import org.apache.commons.codec.digest.DigestUtils; +import org.apache.tika.Tika; +import org.apache.tika.batch.FileResource; +import org.apache.tika.batch.fs.FSProperties; +import org.apache.tika.detect.FileCommandDetector; +import org.apache.tika.eval.db.ColInfo; +import org.apache.tika.eval.db.Cols; +import org.apache.tika.eval.db.TableInfo; +import org.apache.tika.eval.io.IDBWriter; +import org.apache.tika.io.TikaInputStream; +import org.apache.tika.metadata.Metadata; + +import java.io.IOException; +import java.io.InputStream; +import java.nio.file.Files; +import java.nio.file.Path; +import java.sql.Types; +import java.util.HashMap; +import java.util.Map; +import java.util.concurrent.ArrayBlockingQueue; + +/** + * This class profiles actual files as opposed to extracts e.g. {@link ExtractProfiler}. + * This does _not_ parse files, but does run file type identification and digests the + * raw bytes. + * + * If the 'file' command is available on the command line, this will also run the + * FileCommandDetector. + */ + +public class FileProfiler extends AbstractProfiler { +//TODO: we should allow users to select digest type/encoding and file detector(s). + + private static final boolean HAS_FILE = FileCommandDetector.checkHasFile(); + static Options OPTIONS; + static { + + Option inputDir = new Option("inputDir", true, + "optional: directory for original binary input documents."+ + " If not specified, -extracts is crawled as is."); + + OPTIONS = new Options() + .addOption(inputDir) + .addOption("bc", "optional: tika-batch config file") + .addOption("numConsumers", true, "optional: number of consumer threads") + .addOption("db", true, "db file to which to write results") + .addOption("jdbc", true, "EXPERT: full jdbc connection string. Must specify this or -db <h2db>") + .addOption("jdbcDriver", true, "EXPERT: jdbc driver, or specify via -Djdbc.driver") + .addOption("tablePrefix", true, "EXPERT: optional prefix for table names") + .addOption("drop", false, "drop tables if they exist") + .addOption("maxFilesToAdd", true, "maximum number of files to add to the crawler") + + ; + + } + + public static void USAGE() { + HelpFormatter helpFormatter = new HelpFormatter(); + helpFormatter.printHelp( + 80, + "java -jar tika-eval-x.y.jar FileProfiler -inputDir docs -db mydb [-inputDir input]", + "Tool: Profile", + FileProfiler.OPTIONS, + "Note: for the default h2 db, do not include the .mv.db at the end of the db name."); + } + + + + public static TableInfo FILE_PROFILES = HAS_FILE ? + new TableInfo("file_profiles", + new ColInfo(Cols.FILE_PATH, Types.VARCHAR, 2048, "PRIMARY KEY"), + new ColInfo(Cols.LENGTH, Types.BIGINT), + new ColInfo(Cols.SHA256, Types.VARCHAR, 64), + new ColInfo(Cols.TIKA_MIME_ID, Types.INTEGER), + new ColInfo(Cols.FILE_MIME_ID, Types.INTEGER)) + : + new TableInfo("file_profiles", + new ColInfo(Cols.FILE_PATH, Types.VARCHAR, 2048, "PRIMARY KEY"), + new ColInfo(Cols.LENGTH, Types.BIGINT), + new ColInfo(Cols.SHA256, Types.VARCHAR, 64), + new ColInfo(Cols.TIKA_MIME_ID, Types.INTEGER)); + + + + public static final String DETECT_EXCEPTION = "detect-exception"; + private static final Tika TIKA = new Tika(); + + private static final FileCommandDetector FILE_COMMAND_DETECTOR = new FileCommandDetector(); + private final Path inputDir; + + public FileProfiler(ArrayBlockingQueue<FileResource> fileQueue, Path inputDir, IDBWriter dbWriter) { + super(fileQueue, dbWriter); + this.inputDir = inputDir; + } + + + @Override + public boolean processFileResource(FileResource fileResource) { + String relPath = fileResource.getMetadata().get(FSProperties.FS_REL_PATH); + try (InputStream is = fileResource.openInputStream()) { + try (TikaInputStream tis = TikaInputStream.get(is)) { + Path path = tis.getPath(); + Map<Cols, String> data = new HashMap<>(); + int tikaMimeId = writer.getMimeId(detectTika(tis)); + data.put(Cols.FILE_PATH, relPath); + data.put(Cols.TIKA_MIME_ID, Integer.toString(tikaMimeId)); + data.put(Cols.LENGTH, Long.toString(Files.size(path))); + data.put(Cols.SHA256, DigestUtils.sha256Hex(tis)); + if (HAS_FILE) { + int fileMimeId = writer.getMimeId(detectFile(tis)); + data.put(Cols.FILE_MIME_ID, Integer.toString(fileMimeId)); + } + writer.writeRow(FILE_PROFILES, data); + } + } catch (IOException e) { + //log at least! + return false; + } + return true; + } + + private String detectFile(TikaInputStream tis) { + try { + return FILE_COMMAND_DETECTOR.detect(tis, new Metadata()).toString(); + } catch (IOException e) { + return DETECT_EXCEPTION; + } + } + + private String detectTika(TikaInputStream tis) { + try { + return TIKA.detect(tis); + } catch (IOException e) { + return DETECT_EXCEPTION; + } + } +} diff --git a/tika-eval/src/main/java/org/apache/tika/eval/TikaEvalCLI.java b/tika-eval/src/main/java/org/apache/tika/eval/TikaEvalCLI.java index fc3f22b..14faa6b 100644 --- a/tika-eval/src/main/java/org/apache/tika/eval/TikaEvalCLI.java +++ b/tika-eval/src/main/java/org/apache/tika/eval/TikaEvalCLI.java @@ -56,11 +56,60 @@ public class TikaEvalCLI { handleProfile(subsetArgs); } else if (tool.equals("StartDB")) { handleStartDB(subsetArgs); + } else if (tool.equals("FileProfile")){ + handleProfileFiles(subsetArgs); } else { System.out.println(specifyTools()); } } + private void handleProfileFiles(String[] subsetArgs) throws Exception { + List<String> argList = new ArrayList(Arrays.asList(subsetArgs)); + + boolean containsBC = false; + String inputDir = null; + //confirm there's a batch-config file + for (int i = 0; i < argList.size(); i++) { + String arg = argList.get(i); + if (arg.equals("-bc")) { + containsBC = true; + } + } + + Path tmpBCConfig = null; + try { + tmpBCConfig = Files.createTempFile("tika-eval-profiler", ".xml"); + if (! containsBC) { + try (InputStream is = this.getClass().getResourceAsStream("/tika-eval-file-profiler-config.xml")) { + Files.copy(is, tmpBCConfig, StandardCopyOption.REPLACE_EXISTING); + } + argList.add("-bc"); + argList.add(tmpBCConfig.toAbsolutePath().toString()); + } + + String[] updatedArgs = argList.toArray(new String[argList.size()]); + DefaultParser defaultCLIParser = new DefaultParser(); + try { + CommandLine commandLine = defaultCLIParser.parse(FileProfiler.OPTIONS, updatedArgs); + if (commandLine.hasOption("db") && commandLine.hasOption("jdbc")) { + System.out.println("Please specify either the default -db or the full -jdbc, not both"); + ExtractProfiler.USAGE(); + return; + } + } catch (ParseException e) { + System.out.println(e.getMessage()+"\n"); + FileProfiler.USAGE(); + return; + } + + FSBatchProcessCLI.main(updatedArgs); + } finally { + if (tmpBCConfig != null && Files.isRegularFile(tmpBCConfig)) { + Files.delete(tmpBCConfig); + } + } + } + private void handleStartDB(String[] args) throws SQLException { List<String> argList = new ArrayList<>(); argList.add("-web"); @@ -139,9 +188,9 @@ public class TikaEvalCLI { try { tmpBCConfig = Files.createTempFile("tika-eval-profiler", ".xml"); if (! containsBC) { - Files.copy( - this.getClass().getResourceAsStream("/tika-eval-profiler-config.xml"), - tmpBCConfig, StandardCopyOption.REPLACE_EXISTING); + try (InputStream is = this.getClass().getResourceAsStream("/tika-eval-profiler-config.xml")) { + Files.copy(is, tmpBCConfig, StandardCopyOption.REPLACE_EXISTING); + } argList.add("-bc"); argList.add(tmpBCConfig.toAbsolutePath().toString()); } @@ -230,9 +279,9 @@ public class TikaEvalCLI { try { tmpBCConfig = Files.createTempFile("tika-eval", ".xml"); if (! containsBC) { - Files.copy( - this.getClass().getResourceAsStream("/tika-eval-comparison-config.xml"), - tmpBCConfig, StandardCopyOption.REPLACE_EXISTING); + try (InputStream is = this.getClass().getResourceAsStream("/tika-eval-comparison-config.xml")) { + Files.copy(is, tmpBCConfig, StandardCopyOption.REPLACE_EXISTING); + } argList.add("-bc"); argList.add(tmpBCConfig.toAbsolutePath().toString()); diff --git a/tika-eval/src/main/java/org/apache/tika/eval/XMLErrorLogUpdater.java b/tika-eval/src/main/java/org/apache/tika/eval/XMLErrorLogUpdater.java index 18241ef..4260c1a 100644 --- a/tika-eval/src/main/java/org/apache/tika/eval/XMLErrorLogUpdater.java +++ b/tika-eval/src/main/java/org/apache/tika/eval/XMLErrorLogUpdater.java @@ -63,7 +63,6 @@ public class XMLErrorLogUpdater { writer.update(connection, ExtractComparer.EXTRACT_EXCEPTION_TABLE_A, xmlLogFileA); writer.update(connection, ExtractComparer.EXTRACT_EXCEPTION_TABLE_B, xmlLogFileB); connection.commit(); - connection.close(); } public void update(Connection connection, TableInfo tableInfo, Path xmlLogFile) throws Exception { diff --git a/tika-eval/src/main/java/org/apache/tika/eval/batch/EvalConsumerBuilder.java b/tika-eval/src/main/java/org/apache/tika/eval/batch/EvalConsumerBuilder.java index b50d4a1..6f407f6 100644 --- a/tika-eval/src/main/java/org/apache/tika/eval/batch/EvalConsumerBuilder.java +++ b/tika-eval/src/main/java/org/apache/tika/eval/batch/EvalConsumerBuilder.java @@ -104,7 +104,8 @@ public abstract class EvalConsumerBuilder { public void populateRefTables() throws IOException, SQLException { boolean refTablesPopulated = true; - try (Connection connection = dbUtil.getConnection()) { + try{ + Connection connection = dbUtil.getConnection(); for (TableInfo tableInfo : getRefTableInfos()) { int rows = 0; try (ResultSet rs = connection.createStatement().executeQuery("select * from "+ diff --git a/tika-eval/src/main/java/org/apache/tika/eval/batch/FileProfilerBuilder.java b/tika-eval/src/main/java/org/apache/tika/eval/batch/FileProfilerBuilder.java new file mode 100644 index 0000000..0ba7bea --- /dev/null +++ b/tika-eval/src/main/java/org/apache/tika/eval/batch/FileProfilerBuilder.java @@ -0,0 +1,93 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.apache.tika.eval.batch; + +import org.apache.tika.batch.FileResourceConsumer; +import org.apache.tika.eval.AbstractProfiler; +import org.apache.tika.eval.ExtractProfiler; +import org.apache.tika.eval.FileProfiler; +import org.apache.tika.eval.db.TableInfo; +import org.apache.tika.util.PropsUtil; + +import java.io.IOException; +import java.nio.file.Files; +import java.nio.file.Path; +import java.sql.SQLException; +import java.util.ArrayList; +import java.util.Collections; +import java.util.List; +import java.util.Map; + + +public class FileProfilerBuilder extends EvalConsumerBuilder { + + public final static String TABLE_PREFIX_KEY = "tablePrefix"; + + private final List<TableInfo> tableInfos; + public FileProfilerBuilder() { + List<TableInfo> tableInfos = new ArrayList(); + tableInfos.add(AbstractProfiler.MIME_TABLE); + tableInfos.add(FileProfiler.FILE_PROFILES); + this.tableInfos = Collections.unmodifiableList(tableInfos); + + } + + @Override + public FileResourceConsumer build() throws IOException, SQLException { + + Path inputDir = PropsUtil.getPath(localAttrs.get("inputDir"), null); + + //we _could_ set this to extracts (if not null) + //here, but the Crawler defaults to "input" if nothing is passed + //so this won't work + if (inputDir == null) { + throw new RuntimeException("Must specify -inputDir"); + } + return parameterizeProfiler(new FileProfiler(queue, inputDir, + getDBWriter(tableInfos))); + } + + + @Override + protected void updateTableInfosWithPrefixes(Map<String, String> attrs) { + String tableNamePrefix = attrs.get(TABLE_PREFIX_KEY); + if (tableNamePrefix != null && !tableNamePrefix.equals("null")) { + for (TableInfo tableInfo : tableInfos) { + tableInfo.setNamePrefix(tableNamePrefix); + } + } + } + + @Override + protected List<TableInfo> getRefTableInfos() { + return Collections.EMPTY_LIST; + } + + @Override + protected List<TableInfo> getNonRefTableInfos() { + return tableInfos; + } + + @Override + protected void addErrorLogTablePairs(DBConsumersManager manager) { + Path errorLog = PropsUtil.getPath(localAttrs.get("errorLogFile"), null); + if (errorLog == null) { + return; + } + manager.addErrorLogTablePair(errorLog, ExtractProfiler.EXTRACT_EXCEPTION_TABLE); + } +} diff --git a/tika-eval/src/main/java/org/apache/tika/eval/db/Cols.java b/tika-eval/src/main/java/org/apache/tika/eval/db/Cols.java index f3b212c..f0e0955 100644 --- a/tika-eval/src/main/java/org/apache/tika/eval/db/Cols.java +++ b/tika-eval/src/main/java/org/apache/tika/eval/db/Cols.java @@ -37,6 +37,9 @@ public enum Cols { IS_EMBEDDED, EMBEDDED_FILE_PATH, MIME_ID, + TIKA_MIME_ID, + FILE_MIME_ID, + SHA256, MD5, NUM_ATTACHMENTS, HAS_CONTENT, diff --git a/tika-eval/src/main/java/org/apache/tika/eval/db/JDBCUtil.java b/tika-eval/src/main/java/org/apache/tika/eval/db/JDBCUtil.java index 5c3e427..33f1279 100644 --- a/tika-eval/src/main/java/org/apache/tika/eval/db/JDBCUtil.java +++ b/tika-eval/src/main/java/org/apache/tika/eval/db/JDBCUtil.java @@ -51,6 +51,7 @@ public class JDBCUtil { private final String connectionString; private String driverClass; + private Connection connection = null; public JDBCUtil(String connectionString, String driverClass) { this.connectionString = connectionString; @@ -65,7 +66,7 @@ public class JDBCUtil { Properties properties = new Properties(); properties.load(is); for (String k : properties.stringPropertyNames()) { - Matcher m = Pattern.compile("(?i)jdbc:"+k).matcher(connectionString); + Matcher m = Pattern.compile("(?i)jdbc:" + k).matcher(connectionString); if (m.find()) { this.driverClass = properties.getProperty(k); } @@ -86,8 +87,10 @@ public class JDBCUtil { * @throws IOException */ public Connection getConnection() throws SQLException { + if (connection != null) { + return connection; + } String connectionString = getConnectionString(); - Connection conn = null; String jdbcDriver = getJDBCDriverClass(); if (jdbcDriver != null) { try { @@ -96,14 +99,15 @@ public class JDBCUtil { throw new RuntimeException(e); } } - conn = DriverManager.getConnection(connectionString); - conn.setAutoCommit(false); + connection = DriverManager.getConnection(connectionString); + connection.setAutoCommit(false); - return conn; + return connection; } /** * JDBC driver class. Override as necessary. + * * @return */ public String getJDBCDriverClass() { @@ -170,8 +174,8 @@ public class JDBCUtil { } public static void batchInsert(PreparedStatement insertStatement, - TableInfo table, - Map<Cols, String> data) throws SQLException { + TableInfo table, + Map<Cols, String> data) throws SQLException { try { int i = 1; @@ -244,34 +248,35 @@ public class JDBCUtil { public void createTables(List<TableInfo> tableInfos, CREATE_TABLE createTable) throws SQLException, IOException { - try (Connection conn = getConnection ()) { - for (TableInfo tableInfo : tableInfos) { + Connection conn = getConnection(); + for (TableInfo tableInfo : tableInfos) { - if (createTable.equals(CREATE_TABLE.DROP_IF_EXISTS)) { - dropTableIfExists(conn, tableInfo.getName()); - } else if (createTable.equals(CREATE_TABLE.SKIP_IF_EXISTS)) { - if (containsTable(tableInfo.getName())) { - continue; - } + if (createTable.equals(CREATE_TABLE.DROP_IF_EXISTS)) { + dropTableIfExists(conn, tableInfo.getName()); + } else if (createTable.equals(CREATE_TABLE.SKIP_IF_EXISTS)) { + if (containsTable(tableInfo.getName())) { + continue; } - createTable(conn, tableInfo); } - conn.commit(); + createTable(conn, tableInfo); } + conn.commit(); + } public boolean containsTable(String tableName) throws SQLException { - try (Connection connection = getConnection()) { - Set<String> tables = getTables(connection); - if (tables.contains(normalizeTableName(tableName))) { - return true; - } + Connection connection = getConnection(); + Set<String> tables = getTables(connection); + if (tables.contains(normalizeTableName(tableName))) { + return true; } + return false; } /** * Override for custom behavior + * * @param tableName * @return */ diff --git a/tika-eval/src/main/java/org/apache/tika/eval/db/MimeBuffer.java b/tika-eval/src/main/java/org/apache/tika/eval/db/MimeBuffer.java index 3588622..9f6b136 100644 --- a/tika-eval/src/main/java/org/apache/tika/eval/db/MimeBuffer.java +++ b/tika-eval/src/main/java/org/apache/tika/eval/db/MimeBuffer.java @@ -72,7 +72,6 @@ public class MimeBuffer extends AbstractDBBuffer { public void close() throws SQLException { st.close(); connection.commit(); - connection.close(); } private static class MimeUtil { diff --git a/tika-eval/src/main/java/org/apache/tika/eval/io/DBWriter.java b/tika-eval/src/main/java/org/apache/tika/eval/io/DBWriter.java index 909727a..19ff65b 100644 --- a/tika-eval/src/main/java/org/apache/tika/eval/io/DBWriter.java +++ b/tika-eval/src/main/java/org/apache/tika/eval/io/DBWriter.java @@ -135,6 +135,11 @@ public class DBWriter implements IDBWriter { } } + /** + * This closes the writer by executing batch and + * committing changes. This DOES NOT close the connection + * @throws IOException + */ public void close() throws IOException { for (PreparedStatement p : inserts.values()) { try { @@ -148,12 +153,6 @@ public class DBWriter implements IDBWriter { } catch (SQLException e){ throw new IOExceptionWithCause(e); } - try { - conn.close(); - } catch (SQLException e) { - throw new IOExceptionWithCause(e); - } - } private class LastInsert { diff --git a/tika-eval/src/main/resources/tika-eval-file-profiler-config.xml b/tika-eval/src/main/resources/tika-eval-file-profiler-config.xml new file mode 100644 index 0000000..6a7867a --- /dev/null +++ b/tika-eval/src/main/resources/tika-eval-file-profiler-config.xml @@ -0,0 +1,74 @@ +<?xml version="1.0" encoding="UTF-8" standalone="no" ?> +<!-- + Licensed to the Apache Software Foundation (ASF) under one + or more contributor license agreements. See the NOTICE file + distributed with this work for additional information + regarding copyright ownership. The ASF licenses this file + to you under the Apache License, Version 2.0 (the + "License"); you may not use this file except in compliance + with the License. You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, + software distributed under the License is distributed on an + "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + KIND, either express or implied. See the License for the + specific language governing permissions and limitations + under the License. +--> + +<tika-batch-config + maxAliveTimeSeconds="-1" + pauseOnEarlyTerminationMillis="500" + timeoutCheckPulseMillis="1000" + maxQueueSize="10000" + numConsumers="5" + timeoutThresholdMillis="300000"> + + <commandline> + <option opt="bc" longOpt="batch-config" hasArg="true" + description="xml batch config file" required="true"/> + <option opt="inputDir" hasArg="true" + description="dir to start crawling"/> + <option opt="numConsumers" hasArg="true" + description="number of fileConsumers threads"/> + <option opt="extracts" hasArg="true" + description="this dir for analysis" required="false"/> + <option opt="db" hasArg="true" + description="name of db directory or file to which to write results"/> + <option opt="jdbc" hasArg="true" + description="full jdbc connection string"/> + <option opt="jdbcDriver" hasArg="true" + description="canonical class name for jdbc driver"/> + <option opt="tablePrefix" hasArg="true" + description="EXPERT: prefix for table names"/> + <option opt="drop" hasArg="false" description="drop tables if they exist"/> + <option opt="maxFilesToAdd" hasArg="true" description="maximum number of files to add to the crawler"/> + + </commandline> + + + <!-- + Can also add startDir: this tells the crawler to start indexing a + child directory of the inputDir directory. + --> + <crawler builderClass="org.apache.tika.batch.fs.builders.FSCrawlerBuilder" + + crawlOrder="sorted" + maxConsecWaitMillis="5000" + maxFilesToAdd="-1" + maxFilesToConsider="-1" + includeFilePat="" + excludeFilePat="" + maxFileSizeBytes="-1" + /> + + <consumers builderClass="org.apache.tika.eval.batch.EvalConsumersBuilder" + consumerBuilderClass="org.apache.tika.eval.batch.FileProfilerBuilder"/> + + + <!-- reporter and interrupter are optional --> + <reporter builderClass="org.apache.tika.batch.builders.SimpleLogReporterBuilder" sleepMillis="1000" + staleThresholdMillis="500000"/> +</tika-batch-config>
