/*
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements.  See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License.  You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
package org.apache.tika.eval;

import java.nio.file.Files;
import java.nio.file.Path;
import java.nio.file.StandardCopyOption;
import java.sql.SQLException;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.List;

import org.apache.commons.cli.DefaultParser;
import org.apache.commons.cli.ParseException;
import org.apache.tika.batch.fs.FSBatchProcessCLI;
import org.apache.tika.eval.reports.ResultsReporter;
import org.h2.tools.Console;

/**
 * Command-line dispatcher for the tika-eval tools.
 * <p>
 * The first argument selects the tool ({@code Profile}, {@code Compare},
 * {@code Report} or {@code StartDB}); the remaining arguments are handed to
 * that tool. {@code Profile} and {@code Compare} are thin wrappers around
 * tika-batch ({@link FSBatchProcessCLI}) that inject a default batch-config
 * file when the user has not supplied one with {@code -bc}.
 */
public class TikaEvalCLI {
    static final String[] tools = {"Profile", "Compare", "Report", "StartDB"};

    /** Builds the "which tool?" usage message listing all known tools. */
    private static String specifyTools() {
        StringBuilder sb = new StringBuilder();
        sb.append("Must specify one of the following tools in the first parameter:\n");
        for (String s : tools) {
            sb.append(s).append("\n");
        }
        return sb.toString();
    }

    /**
     * Dispatches to the tool named by {@code args[0]}, passing the remaining
     * arguments through.  Prints the tool list if the name is unrecognized.
     */
    private void execute(String[] args) throws Exception {
        String tool = args[0];
        String[] subsetArgs = Arrays.copyOfRange(args, 1, args.length);
        if (tool.equals("Report")) {
            handleReport(subsetArgs);
        } else if (tool.equals("Compare")) {
            handleCompare(subsetArgs);
        } else if (tool.equals("Profile")) {
            handleProfile(subsetArgs);
        } else if (tool.equals("StartDB")) {
            handleStartDB(subsetArgs);
        } else {
            System.out.println(specifyTools());
        }
    }

    /**
     * Starts the H2 web console and then blocks forever so the JVM stays up
     * while the user browses the database.  Returns only if interrupted.
     */
    private void handleStartDB(String[] args) throws SQLException {
        List<String> argList = new ArrayList<>();
        argList.add("-web");
        Console.main(argList.toArray(new String[argList.size()]));
        while (true) {
            try {
                Thread.sleep(1000);
            } catch (InterruptedException e) {
                //restore interrupt status before leaving so callers can see it
                Thread.currentThread().interrupt();
                break;
            }
        }
    }

    /**
     * Returns true if {@code alterExtract} is null (unset) or one of the
     * accepted values: as_is, first_only, concatenate_content.
     */
    private static boolean isValidAlterExtract(String alterExtract) {
        return alterExtract == null
                || alterExtract.equals("as_is")
                || alterExtract.equals("concatenate_content")
                || alterExtract.equals("first_only");
    }

    /**
     * Runs the single-extract-directory profiler via tika-batch.
     * Validates {@code -alterExtract}, mirrors {@code -inputDir}/{@code -extractDir}
     * into each other when only one is given (so the tika-batch crawler does not
     * fall back to crawling "input"), and injects the default profiler
     * batch-config when {@code -bc} is absent.
     */
    private void handleProfile(String[] subsetArgs) throws Exception {
        List<String> argList = new ArrayList<>(Arrays.asList(subsetArgs));

        boolean containsBC = false;
        String inputDir = null;
        String extractDir = null;
        String alterExtract = null;
        //confirm there's a batch-config file and capture the paths we may need to mirror
        for (int i = 0; i < argList.size(); i++) {
            String arg = argList.get(i);
            if (arg.equals("-bc")) {
                containsBC = true;
            } else if (arg.equals("-inputDir")) {
                if (i + 1 >= argList.size()) {
                    System.err.println("Must specify directory after -inputDir");
                    ExtractProfiler.USAGE();
                    return;
                }
                inputDir = argList.get(++i);
            } else if (arg.equals("-extractDir")) {
                if (i + 1 >= argList.size()) {
                    System.err.println("Must specify directory after -extractDir");
                    ExtractProfiler.USAGE();
                    return;
                }
                extractDir = argList.get(++i);
            } else if (arg.equals("-alterExtract")) {
                if (i + 1 >= argList.size()) {
                    //was: "after -extractsB" + ExtractComparer.USAGE() -- copy/paste bug
                    System.err.println("Must specify value after -alterExtract");
                    ExtractProfiler.USAGE();
                    return;
                }
                alterExtract = argList.get(++i);
            }
        }

        if (!isValidAlterExtract(alterExtract)) {
            System.out.println("Sorry, I don't understand:" + alterExtract +
                    ". The values must be one of: as_is, first_only, concatenate_content");
            ExtractProfiler.USAGE();
            return;
        }

        //need to specify each in this commandline
        //if only extractDir is passed to tika-batch,
        //the crawler will see no inputDir and start crawling "input".
        //this allows the user to specify either extractDir or inputDir
        if (extractDir == null && inputDir != null) {
            argList.add("-extractDir");
            argList.add(inputDir);
        } else if (inputDir == null && extractDir != null) {
            argList.add("-inputDir");
            argList.add(extractDir);
        }

        Path tmpBCConfig = null;
        try {
            tmpBCConfig = Files.createTempFile("tika-eval-profiler", ".xml");
            if (!containsBC) {
                //copy the bundled default config out to a real file for tika-batch
                Files.copy(
                        this.getClass().getResourceAsStream("/tika-eval-profiler-config.xml"),
                        tmpBCConfig, StandardCopyOption.REPLACE_EXISTING);
                argList.add("-bc");
                argList.add(tmpBCConfig.toAbsolutePath().toString());
            }

            String[] updatedArgs = argList.toArray(new String[argList.size()]);
            DefaultParser defaultCLIParser = new DefaultParser();
            try {
                //dry-run parse so the user gets a usage message instead of a batch failure
                defaultCLIParser.parse(ExtractProfiler.OPTIONS, updatedArgs);
            } catch (ParseException e) {
                System.out.println(e.getMessage() + "\n");
                ExtractProfiler.USAGE();
                return;
            }

            FSBatchProcessCLI.main(updatedArgs);
        } finally {
            if (tmpBCConfig != null && Files.isRegularFile(tmpBCConfig)) {
                Files.delete(tmpBCConfig);
            }
        }
    }

    /**
     * Runs the A/B extract comparer via tika-batch.
     * Validates {@code -alterExtract}, defaults {@code -inputDir} to
     * {@code -extractsA} when absent (so the crawler does not crawl "input"),
     * and injects the default comparison batch-config when {@code -bc} is absent.
     */
    private void handleCompare(String[] subsetArgs) throws Exception {
        List<String> argList = new ArrayList<>(Arrays.asList(subsetArgs));

        boolean containsBC = false;
        String inputDir = null;
        String extractsA = null;
        String alterExtract = null;
        //confirm there's a batch-config file and capture the paths we may need to mirror
        for (int i = 0; i < argList.size(); i++) {
            String arg = argList.get(i);
            if (arg.equals("-bc")) {
                containsBC = true;
            } else if (arg.equals("-inputDir")) {
                if (i + 1 >= argList.size()) {
                    System.err.println("Must specify directory after -inputDir");
                    ExtractComparer.USAGE();
                    return;
                }
                inputDir = argList.get(++i);
            } else if (arg.equals("-extractsA")) {
                if (i + 1 >= argList.size()) {
                    System.err.println("Must specify directory after -extractsA");
                    ExtractComparer.USAGE();
                    return;
                }
                extractsA = argList.get(++i);
            } else if (arg.equals("-alterExtract")) {
                if (i + 1 >= argList.size()) {
                    //was: "after -extractsB" -- copy/paste bug
                    System.err.println("Must specify value after -alterExtract");
                    ExtractComparer.USAGE();
                    return;
                }
                alterExtract = argList.get(++i);
            }
        }

        if (!isValidAlterExtract(alterExtract)) {
            System.out.println("Sorry, I don't understand:" + alterExtract +
                    ". The values must be one of: as_is, first_only, concatenate_content");
            ExtractComparer.USAGE();
            return;
        }

        //need to specify each in the commandline that goes into tika-batch
        //if only extractDir is passed to tika-batch,
        //the crawler will see no inputDir and start crawling "input".
        //if the user doesn't specify inputDir, crawl extractsA
        if (inputDir == null && extractsA != null) {
            argList.add("-inputDir");
            argList.add(extractsA);
        }

        Path tmpBCConfig = null;
        try {
            tmpBCConfig = Files.createTempFile("tika-eval", ".xml");
            if (!containsBC) {
                //copy the bundled default config out to a real file for tika-batch
                Files.copy(
                        this.getClass().getResourceAsStream("/tika-eval-comparison-config.xml"),
                        tmpBCConfig, StandardCopyOption.REPLACE_EXISTING);
                argList.add("-bc");
                argList.add(tmpBCConfig.toAbsolutePath().toString());
            }
            String[] updatedArgs = argList.toArray(new String[argList.size()]);
            DefaultParser defaultCLIParser = new DefaultParser();
            try {
                //dry-run parse so the user gets a usage message instead of a batch failure
                defaultCLIParser.parse(ExtractComparer.OPTIONS, updatedArgs);
            } catch (ParseException e) {
                System.out.println(e.getMessage() + "\n");
                ExtractComparer.USAGE();
                return;
            }

            FSBatchProcessCLI.main(updatedArgs);
        } finally {
            if (tmpBCConfig != null && Files.isRegularFile(tmpBCConfig)) {
                Files.delete(tmpBCConfig);
            }
        }
    }

    /** Delegates straight to the reporting tool. */
    private void handleReport(String[] subsetArgs) throws Exception {
        ResultsReporter.main(subsetArgs);
    }

    public static void main(String[] args) throws Exception {
        TikaEvalCLI cli = new TikaEvalCLI();
        if (args.length == 0) {
            System.err.println(specifyTools());
            return;
        }
        cli.execute(args);
    }
}
/*
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements.  See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License.  You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
package org.apache.tika.eval;


import javax.xml.stream.XMLInputFactory;
import javax.xml.stream.XMLStreamConstants;
import javax.xml.stream.XMLStreamException;
import javax.xml.stream.XMLStreamReader;
import java.io.IOException;
import java.io.InputStream;
import java.io.StringReader;
import java.nio.file.Files;
import java.nio.file.Path;
import java.nio.file.Paths;
import java.sql.Connection;
import java.sql.PreparedStatement;
import java.sql.ResultSet;
import java.sql.SQLException;
import java.sql.Statement;

import org.apache.log4j.Level;
import org.apache.tika.eval.db.Cols;
import org.apache.tika.eval.db.DBUtil;
import org.apache.tika.eval.db.H2Util;
import org.apache.tika.eval.db.TableInfo;
import org.apache.tika.eval.io.XMLLogMsgHandler;
import org.apache.tika.eval.io.XMLLogReader;
import org.apache.tika.io.IOExceptionWithCause;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

/**
 * This is a very task specific class that reads a log file and updates
 * the "comparisons" table. It should not be run in a multithreaded environment.
 */
public class XMLErrorLogUpdater {

    //was registered under ResultsReporter.class -- wrong class for this logger
    protected static final Logger LOGGER = LoggerFactory.getLogger(XMLErrorLogUpdater.class);

    private Statement statement;

    /**
     * Usage: args[0] = error log for extracts A, args[1] = error log for
     * extracts B, args[2] = path to the H2 database.
     */
    public static void main(String[] args) throws Exception {

        XMLErrorLogUpdater writer = new XMLErrorLogUpdater();
        Path xmlLogFileA = Paths.get(args[0]);
        Path xmlLogFileB = Paths.get(args[1]);
        Path db = Paths.get(args[2]);
        DBUtil dbUtil = new H2Util(db);
        Connection connection = dbUtil.getConnection(true);
        writer.update(connection, ExtractComparer.ERROR_TABLE_A, xmlLogFileA);
        writer.update(connection, ExtractComparer.ERROR_TABLE_B, xmlLogFileB);
        connection.commit();
        connection.close();
    }

    /**
     * Reads {@code xmlLogFile} and records every ERROR-level timeout/oom
     * entry into {@code tableInfo}'s table on {@code connection}.
     * Commits and closes the statement when done; the connection itself is
     * left open for the caller.
     */
    public void update(Connection connection, TableInfo tableInfo, Path xmlLogFile) throws Exception {
        statement = connection.createStatement();
        XMLLogReader reader = new XMLLogReader();
        try (InputStream is = Files.newInputStream(xmlLogFile)) {
            reader.read(is, new ErrorMsgUpdater(tableInfo.getName()));
        } catch (IOException e) {
            //keep the cause -- the original exception was being dropped here
            throw new RuntimeException(
                    "Problem reading: " + xmlLogFile.toAbsolutePath().toString(), e);
        } finally {
            try {
                connection.commit();
                statement.close();
            } catch (SQLException e) {
                throw new RuntimeException("Failed to close db connection!", e);
            }
        }
    }

    /**
     * Handler that parses each log message's XML payload and upserts
     * timeout/oom parse errors into the given error table.
     */
    private class ErrorMsgUpdater implements XMLLogMsgHandler {
        private final String errorTablename;

        private ErrorMsgUpdater(String errorTablename) {
            this.errorTablename = errorTablename;
        }

        @Override
        public void handleMsg(Level level, String xml) throws SQLException, IOException {
            //only ERROR-level messages carry timed_out/oom records
            if (!level.equals(Level.ERROR)) {
                return;
            }
            XMLStreamReader reader;
            try {
                reader = XMLInputFactory.newInstance().createXMLStreamReader(new StringReader(xml));
            } catch (XMLStreamException e) {
                throw new IOExceptionWithCause(e);
            }
            String resourceId = null;
            try {
                //stop after the first timed_out/oom element is found
                //(a dead "type == null &&" condition was removed from this loop)
                while (reader.hasNext() && resourceId == null) {
                    reader.next();
                    switch (reader.getEventType()) {
                        case XMLStreamConstants.START_ELEMENT:
                            if ("timed_out".equals(reader.getLocalName())) {
                                resourceId = reader.getAttributeValue("", "resourceId");
                                update(errorTablename, resourceId,
                                        AbstractProfiler.PARSE_ERROR_TYPE.TIMEOUT);
                            } else if ("oom".equals(reader.getLocalName())) {
                                resourceId = reader.getAttributeValue("", "resourceId");
                                update(errorTablename, resourceId,
                                        AbstractProfiler.PARSE_ERROR_TYPE.OOM);
                            }
                            break;
                    }
                }
            } catch (XMLStreamException e) {
                throw new IOExceptionWithCause(e);
            } finally {
                //close the reader even when parsing throws
                try {
                    reader.close();
                } catch (XMLStreamException e) {
                    throw new IOExceptionWithCause(e);
                }
            }
        }

        /**
         * Updates existing rows matching {@code filePath}/its container id with
         * the parse-error type, or inserts a new row if none exist.
         * File paths come straight out of the log file, so all values are bound
         * via PreparedStatement parameters (the original concatenated them into
         * the SQL, which broke on quotes and was injectable).
         */
        private void update(String errorTableName,
                            String filePath, AbstractProfiler.PARSE_ERROR_TYPE type) throws SQLException {
            int containerId = getContainerId(filePath);
            Connection connection = statement.getConnection();

            //now try to figure out if that file already exists in parse errors
            int hitCount = 0;
            String sql = "SELECT count(1) from " + errorTableName +
                    " where " + Cols.CONTAINER_ID + " = ? or " + Cols.FILE_PATH + " = ?";
            try (PreparedStatement ps = connection.prepareStatement(sql)) {
                ps.setInt(1, containerId);
                ps.setString(2, filePath);
                try (ResultSet rs = ps.executeQuery()) {
                    while (rs.next()) {
                        hitCount = rs.getInt(1);
                    }
                }
            }

            int updated;
            if (hitCount > 0) {
                //if it does, update all records matching that path or container id
                sql = "UPDATE " + errorTableName +
                        " SET " + Cols.PARSE_ERROR_TYPE_ID + " = ?, " +
                        Cols.FILE_PATH + " = ?" +
                        " where " + Cols.CONTAINER_ID + " = ? or " + Cols.FILE_PATH + " = ?";
                try (PreparedStatement ps = connection.prepareStatement(sql)) {
                    ps.setInt(1, type.ordinal());
                    ps.setString(2, filePath);
                    ps.setInt(3, containerId);
                    ps.setString(4, filePath);
                    updated = ps.executeUpdate();
                }
            } else if (containerId > -1) {
                //if not and container id > -1, insert full record
                sql = "INSERT INTO " + errorTableName +
                        " (" + Cols.CONTAINER_ID + "," + Cols.FILE_PATH + "," +
                        Cols.PARSE_ERROR_TYPE_ID + ") values (?, ?, ?)";
                try (PreparedStatement ps = connection.prepareStatement(sql)) {
                    ps.setInt(1, containerId);
                    ps.setString(2, filePath);
                    ps.setInt(3, type.ordinal());
                    updated = ps.executeUpdate();
                }
            } else {
                //if container id == -1, insert only file path and parse error type id
                sql = "INSERT INTO " + errorTableName +
                        " (" + Cols.FILE_PATH.name() + "," + Cols.PARSE_ERROR_TYPE_ID + ")" +
                        " values (?, ?)";
                try (PreparedStatement ps = connection.prepareStatement(sql)) {
                    ps.setString(1, filePath);
                    ps.setInt(2, type.ordinal());
                    updated = ps.executeUpdate();
                }
            }

            if (updated == 0) {
                LOGGER.warn("made no updates in xmlerrorlogupdater!");
            } else if (updated > 1) {
                LOGGER.warn("made too many updates");
            }
        }

        /**
         * Looks up the container id for {@code resourceId} in the containers
         * table; returns -1 (and warns) when no row matches.
         */
        private int getContainerId(String resourceId) throws SQLException {
            int containerId = -1;
            int resultCount = 0;
            String sql = "SELECT " + Cols.CONTAINER_ID.name() +
                    " from " + ExtractProfiler.CONTAINER_TABLE.getName() +
                    " where " + Cols.FILE_PATH + " = ?";
            try (PreparedStatement ps = statement.getConnection().prepareStatement(sql)) {
                ps.setString(1, resourceId);
                try (ResultSet rs = ps.executeQuery()) {
                    while (rs.next()) {
                        containerId = rs.getInt(1);
                        resultCount++;
                    }
                }
            }

            if (resultCount == 0) {
                LOGGER.warn("Should have found a container for: {}", resourceId);
            } else if (resultCount > 1) {
                LOGGER.error("Records ids should be unique: {}", resourceId);
            }
            return containerId;
        }
    }
}
/*
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements.  See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License.  You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
package org.apache.tika.eval.batch;

import java.io.IOException;
import java.nio.file.Path;
import java.sql.Connection;
import java.sql.SQLException;
import java.util.ArrayList;
import java.util.List;

import org.apache.tika.batch.ConsumersManager;
import org.apache.tika.batch.FileResourceConsumer;
import org.apache.tika.eval.AbstractProfiler;
import org.apache.tika.eval.XMLErrorLogUpdater;
import org.apache.tika.eval.db.DBUtil;
import org.apache.tika.eval.db.TableInfo;


/**
 * ConsumersManager that owns the shared db connection for the eval consumers.
 * On shutdown it closes each profiler's writer, replays any registered
 * error-log files into their tables, commits, and closes the connection.
 */
public class DBConsumersManager extends ConsumersManager {

    private final Connection conn;
    //log file -> table pairs registered via addErrorLogTablePair
    final List<LogTablePair> errorLogs = new ArrayList<>();

    public DBConsumersManager(DBUtil dbUtil, List<FileResourceConsumer> consumers)
            throws IOException {
        super(consumers);
        this.conn = dbUtil.getConnection(true);
    }


    @Override
    public void shutdown() {
        //close the connection even if an earlier step throws; the original
        //leaked the connection on any failure before conn.close()
        try {
            for (FileResourceConsumer consumer : getConsumers()) {
                if (consumer instanceof AbstractProfiler) {
                    try {
                        ((AbstractProfiler) consumer).closeWriter();
                    } catch (IOException e) {
                        throw new RuntimeException(e);
                    }
                }
            }
            //MUST HAPPEN AFTER consumers have closed and
            //committed container information!!!
            XMLErrorLogUpdater up = new XMLErrorLogUpdater();
            for (LogTablePair p : errorLogs) {
                try {
                    up.update(conn, p.tableInfo, p.log);
                } catch (Exception e) {
                    throw new RuntimeException(e);
                }
            }

            try {
                conn.commit();
            } catch (SQLException e) {
                throw new RuntimeException(e);
            }
        } finally {
            try {
                conn.close();
            } catch (SQLException e) {
                throw new RuntimeException(e);
            }
        }
    }

    /**
     * Registers an error log file to be replayed into {@code tableInfo}'s
     * table during {@link #shutdown()}.
     */
    public void addErrorLogTablePair(Path log, TableInfo tableInfo) {
        LogTablePair p = new LogTablePair();
        p.log = log;
        p.tableInfo = tableInfo;
        errorLogs.add(p);
    }

    /** Simple value holder; static because it needs no enclosing instance. */
    static class LogTablePair {
        Path log;
        TableInfo tableInfo;
    }
}
/*
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements.  See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License.  You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

package org.apache.tika.eval.batch;

import java.io.IOException;
import java.sql.SQLException;
import java.util.HashMap;
import java.util.List;
import java.util.Map;
import java.util.concurrent.ArrayBlockingQueue;
import java.util.concurrent.atomic.AtomicInteger;

import org.apache.tika.batch.FileResource;
import org.apache.tika.batch.FileResourceConsumer;
import org.apache.tika.eval.AbstractProfiler;
import org.apache.tika.eval.db.Cols;
import org.apache.tika.eval.db.DBUtil;
import org.apache.tika.eval.db.TableInfo;
import org.apache.tika.eval.io.ExtractReader;
import org.apache.tika.eval.io.IDBWriter;

/**
 * Base class for builders that create eval {@link FileResourceConsumer}s.
 * Subclasses define which tables exist, how the db writer is built and which
 * error logs feed which tables; this class holds the shared queue/attrs/db
 * state and seeds the reference (lookup) tables.
 */
public abstract class EvalConsumerBuilder {
    //guards populateRefTables so the ref rows are written only once per builder
    private AtomicInteger count = new AtomicInteger(0);
    protected ArrayBlockingQueue<FileResource> queue;
    Map<String, String> localAttrs;
    DBUtil dbUtil;

    /** Stores the shared state every built consumer will draw from. */
    public void init(ArrayBlockingQueue<FileResource> queue, Map<String, String> localAttrs,
                     DBUtil dbUtil) {
        this.queue = queue;
        this.localAttrs = localAttrs;
        this.dbUtil = dbUtil;
    }

    /** Builds one consumer wired to the shared queue and db. */
    public abstract FileResourceConsumer build() throws IOException, SQLException;

    /** Every table this builder's consumers will write to. */
    protected abstract List<TableInfo> getTableInfo();

    /** Creates the db writer the consumers share. */
    protected abstract IDBWriter getDBWriter() throws IOException, SQLException;

    /** Registers error-log/table pairs to be replayed at shutdown. */
    protected abstract void addErrorLogTablePairs(DBConsumersManager manager);

    /**
     * Writes the enum-backed reference tables (parse error types, parse
     * exception types, extract error types).  Idempotent per builder instance:
     * only the first call writes rows.
     * TODO: figure out a cleaner way of doing this!
     */
    public void populateRefTables(IDBWriter writer) throws IOException, SQLException {
        if (count.getAndIncrement() > 0) {
            return;
        }
        Map<Cols, String> m = new HashMap<>();
        for (AbstractProfiler.PARSE_ERROR_TYPE t : AbstractProfiler.PARSE_ERROR_TYPE.values()) {
            m.clear();
            m.put(Cols.PARSE_ERROR_TYPE_ID, Integer.toString(t.ordinal()));
            m.put(Cols.PARSE_ERROR_DESCRIPTION, t.name());
            writer.writeRow(AbstractProfiler.REF_PARSE_ERROR_TYPES, m);
        }

        for (AbstractProfiler.EXCEPTION_TYPE t : AbstractProfiler.EXCEPTION_TYPE.values()) {
            m.clear();
            m.put(Cols.PARSE_EXCEPTION_TYPE_ID, Integer.toString(t.ordinal()));
            m.put(Cols.PARSE_EXCEPTION_DESCRIPTION, t.name());
            writer.writeRow(AbstractProfiler.REF_PARSE_EXCEPTION_TYPES, m);
        }

        for (AbstractProfiler.EXTRACT_ERROR_TYPE t :
                AbstractProfiler.EXTRACT_ERROR_TYPE.values()) {
            m.clear();
            m.put(Cols.EXTRACT_ERROR_TYPE_ID, Integer.toString(t.ordinal()));
            m.put(Cols.EXTRACT_ERROR_DESCRIPTION, t.name());
            writer.writeRow(AbstractProfiler.REF_EXTRACT_ERROR_TYPES, m);
        }

    }

    /**
     * Maps the "alterExtract" attribute (as_is / first_only /
     * concatenate_content; case-insensitive; null defaults to as_is) to its
     * enum value.
     *
     * @throws IllegalArgumentException on an unrecognized value
     *         (a RuntimeException subclass, so existing callers still catch it)
     */
    ExtractReader.ALTER_METADATA_LIST getAlterMetadata(Map<String, String> localAttrs) {

        String alterExtractString = localAttrs.get("alterExtract");
        if (alterExtractString == null || alterExtractString.equalsIgnoreCase("as_is")) {
            return ExtractReader.ALTER_METADATA_LIST.AS_IS;
        } else if (alterExtractString.equalsIgnoreCase("first_only")) {
            return ExtractReader.ALTER_METADATA_LIST.FIRST_ONLY;
        } else if (alterExtractString.equalsIgnoreCase("concatenate_content")) {
            return ExtractReader.ALTER_METADATA_LIST.CONCATENATE_CONTENT_INTO_FIRST;
        }
        throw new IllegalArgumentException(
                "options for alterExtract: as_is, first_only, concatenate_content."
                        + " I don't understand:" + alterExtractString);
    }

}
/*
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements.  See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License.  You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
package org.apache.tika.eval.batch;


import java.io.IOException;
import java.nio.file.Path;
import java.nio.file.Paths;
import java.sql.SQLException;
import java.util.LinkedList;
import java.util.List;
import java.util.Map;
import java.util.concurrent.ArrayBlockingQueue;

import org.apache.tika.batch.ConsumersManager;
import org.apache.tika.batch.FileResource;
import org.apache.tika.batch.FileResourceConsumer;
import org.apache.tika.batch.builders.AbstractConsumersBuilder;
import org.apache.tika.batch.builders.BatchProcessBuilder;
import org.apache.tika.eval.AbstractProfiler;
import org.apache.tika.eval.db.DBUtil;
import org.apache.tika.eval.db.H2Util;
import org.apache.tika.eval.util.LanguageIDWrapper;
import org.apache.tika.util.ClassLoaderUtil;
import org.apache.tika.util.PropsUtil;
import org.apache.tika.util.XMLDOMUtil;
import org.w3c.dom.Node;

/**
 * tika-batch hook that wires up the eval consumers from the batch-config XML:
 * loads language-id models and the common-tokens file, opens/creates the H2
 * database, instantiates the configured {@link EvalConsumerBuilder}, and
 * returns a {@link DBConsumersManager} owning the shared connection.
 */
public class EvalConsumersBuilder extends AbstractConsumersBuilder {

    @Override
    public ConsumersManager build(Node node, Map<String, String> runtimeAttributes,
                                  ArrayBlockingQueue<FileResource> queue) {

        List<FileResourceConsumer> consumers = new LinkedList<>();
        int numConsumers = BatchProcessBuilder.getNumConsumers(runtimeAttributes);

        Map<String, String> localAttrs = XMLDOMUtil.mapifyAttrs(node, runtimeAttributes);


        Path db = getPath(localAttrs, "db");
        Path langModelDir = getPath(localAttrs, "langModelDir");

        try {
            //fall back to the models bundled on the classpath when no dir is given
            if (langModelDir == null) {
                LanguageIDWrapper.loadBuiltInModels();
            } else {
                LanguageIDWrapper.loadModels(langModelDir);
            }
        } catch (IOException e) {
            throw new RuntimeException(e);
        }

        Path commonTokens = getNonNullPath(localAttrs, "commonTokens");
        try {
            AbstractProfiler.loadCommonTokens(commonTokens);
        } catch (IOException e) {
            throw new RuntimeException(e);
        }

        boolean append = PropsUtil.getBoolean(localAttrs.get("dbAppend"), false);

        if (db == null) {
            throw new RuntimeException("Must specify: -db");
        }
        //TODO: parameterize which db util to use
        DBUtil util = new H2Util(db);
        EvalConsumerBuilder consumerBuilder = ClassLoaderUtil.buildClass(EvalConsumerBuilder.class,
                PropsUtil.getString(localAttrs.get("consumerBuilderClass"), null));
        if (consumerBuilder == null) {
            throw new RuntimeException("Must specify consumerBuilderClass in config file");
        }
        consumerBuilder.init(queue, localAttrs, util);

        try {
            util.createDB(consumerBuilder.getTableInfo(), append);
        } catch (SQLException | IOException e) {
            //multi-catch: both were wrapped identically in separate blocks before
            throw new RuntimeException(e);
        }
        for (int i = 0; i < numConsumers; i++) {
            try {
                consumers.add(consumerBuilder.build());
            } catch (IOException | SQLException e) {
                throw new RuntimeException(e);
            }
        }

        DBConsumersManager manager;
        try {
            manager = new DBConsumersManager(util, consumers);
        } catch (IOException e) {
            throw new RuntimeException(e);
        }
        consumerBuilder.addErrorLogTablePairs(manager);

        return manager;
    }

    /**
     * Like {@link #getPath}, but fails fast when the attribute is missing.
     */
    private Path getNonNullPath(Map<String, String> attrs, String key) {
        Path p = getPath(attrs, key);
        if (p == null) {
            throw new RuntimeException("Must specify a file for this attribute: " + key);
        }
        return p;
    }


    /** Returns the attribute's value as a Path, or null if it is absent. */
    protected Path getPath(Map<String, String> attrs, String key) {
        String filePath = attrs.get(key);
        if (filePath == null) {
            return null;
        }
        return Paths.get(filePath);
    }


}
---------------------------------------------------------------------- diff --git a/tika-eval/src/main/java/org/apache/tika/eval/batch/FileComparerBuilder.java b/tika-eval/src/main/java/org/apache/tika/eval/batch/FileComparerBuilder.java new file mode 100644 index 0000000..cface16 --- /dev/null +++ b/tika-eval/src/main/java/org/apache/tika/eval/batch/FileComparerBuilder.java @@ -0,0 +1,122 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
package org.apache.tika.eval.batch;


import java.io.IOException;
import java.nio.file.Path;
import java.sql.SQLException;
import java.util.ArrayList;
import java.util.List;

import org.apache.tika.batch.FileResourceConsumer;
import org.apache.tika.config.TikaConfig;
import org.apache.tika.eval.AbstractProfiler;
import org.apache.tika.eval.ExtractComparer;
import org.apache.tika.eval.db.TableInfo;
import org.apache.tika.eval.io.DBWriter;
import org.apache.tika.eval.io.ExtractReader;
import org.apache.tika.eval.io.IDBWriter;
import org.apache.tika.util.PropsUtil;

/**
 * Builds an {@link ExtractComparer} consumer that compares two directories
 * of extracts ("A" vs "B") against a common input directory and writes the
 * comparison results to the database.
 */
public class FileComparerBuilder extends EvalConsumerBuilder {
    private final static String WHICH_DB = "h2";//TODO: allow flexibility


    @Override
    public FileResourceConsumer build() throws IOException, SQLException {
        Path thisRootDir = PropsUtil.getPath(localAttrs.get("extractsA"), null);
        if (thisRootDir == null) {
            throw new RuntimeException("Must specify \"extractsA\" -- directory for 'A' extracts");
        }
        Path thatRootDir = PropsUtil.getPath(localAttrs.get("extractsB"), null);
        if (thatRootDir == null) {
            throw new RuntimeException("Must specify \"extractsB\" -- directory for 'B' extracts");
        }

        Path inputRootDir = PropsUtil.getPath(localAttrs.get("inputDir"), null);
        //this is for the sake of the crawler; fail before doing any db work
        if (inputRootDir == null) {
            throw new RuntimeException("Must specify an -inputDir");
        }

        long minJsonLength = PropsUtil.getLong(localAttrs.get("minJsonFileSizeBytes"), -1L);
        long maxJsonLength = PropsUtil.getLong(localAttrs.get("maxJsonFileSizeBytes"), -1L);

        ExtractReader.ALTER_METADATA_LIST alterExtractList = getAlterMetadata(localAttrs);


        IDBWriter writer = getDBWriter();
        //TODO: clean up the writing of the ref tables!!!
        try {
            populateRefTables(writer);
        } catch (SQLException e) {
            throw new RuntimeException("Can't populate ref tables", e);
        }

        return new ExtractComparer(queue, inputRootDir, thisRootDir, thatRootDir, writer,
                minJsonLength, maxJsonLength, alterExtractList);
    }

    /**
     * @return every table the comparer writes to, plus the reference tables;
     *         each table appears exactly once
     */
    @Override
    protected List<TableInfo> getTableInfo() {
        List<TableInfo> tableInfos = new ArrayList<>();
        tableInfos.add(ExtractComparer.COMPARISON_CONTAINERS);
        tableInfos.add(ExtractComparer.PROFILES_A);
        tableInfos.add(ExtractComparer.PROFILES_B);
        //NOTE: the original added ERROR_TABLE_A/B twice; listed once here
        tableInfos.add(ExtractComparer.ERROR_TABLE_A);
        tableInfos.add(ExtractComparer.ERROR_TABLE_B);
        tableInfos.add(ExtractComparer.EXCEPTION_TABLE_A);
        tableInfos.add(ExtractComparer.EXCEPTION_TABLE_B);
        tableInfos.add(ExtractComparer.CONTENTS_TABLE_A);
        tableInfos.add(ExtractComparer.CONTENTS_TABLE_B);
        tableInfos.add(ExtractComparer.EMBEDDED_FILE_PATH_TABLE_A);
        tableInfos.add(ExtractComparer.EMBEDDED_FILE_PATH_TABLE_B);

        tableInfos.add(ExtractComparer.CONTENT_COMPARISONS);
        tableInfos.add(AbstractProfiler.MIME_TABLE);
        tableInfos.add(ExtractComparer.REF_PAIR_NAMES);
        tableInfos.add(AbstractProfiler.REF_PARSE_ERROR_TYPES);
        tableInfos.add(AbstractProfiler.REF_PARSE_EXCEPTION_TYPES);
        tableInfos.add(AbstractProfiler.REF_EXTRACT_ERROR_TYPES);
        return tableInfos;
    }

    @Override
    protected IDBWriter getDBWriter() throws IOException, SQLException {
        return new DBWriter(getTableInfo(), TikaConfig.getDefaultConfig(), dbUtil);
    }

    /**
     * Registers the "A" and "B" error-log files (if configured) with the
     * manager so that crawl errors are loaded into the error tables.
     * If "A" is not configured, "B" is not consulted (original behavior).
     */
    @Override
    protected void addErrorLogTablePairs(DBConsumersManager manager) {
        Path errorLogA = PropsUtil.getPath(localAttrs.get("errorLogFileA"), null);
        if (errorLogA == null) {
            return;
        }
        manager.addErrorLogTablePair(errorLogA, ExtractComparer.ERROR_TABLE_A);

        Path errorLogB = PropsUtil.getPath(localAttrs.get("errorLogFileB"), null);
        if (errorLogB == null) {
            return;
        }
        manager.addErrorLogTablePair(errorLogB, ExtractComparer.ERROR_TABLE_B);

    }

}
package org.apache.tika.eval.batch;

import java.io.IOException;
import java.nio.file.Files;
import java.nio.file.Path;
import java.sql.SQLException;
import java.util.ArrayList;
import java.util.List;

import org.apache.tika.batch.FileResourceConsumer;
import org.apache.tika.config.TikaConfig;
import org.apache.tika.eval.AbstractProfiler;
import org.apache.tika.eval.ExtractProfiler;
import org.apache.tika.eval.db.TableInfo;
import org.apache.tika.eval.io.DBWriter;
import org.apache.tika.eval.io.ExtractReader;
import org.apache.tika.eval.io.IDBWriter;
import org.apache.tika.util.PropsUtil;


/**
 * Builds an {@link ExtractProfiler} consumer that profiles a single
 * directory of extracts and writes the results to the database.
 */
public class SingleFileConsumerBuilder extends EvalConsumerBuilder {

    @Override
    public FileResourceConsumer build() throws IOException {
        Path extractDir = PropsUtil.getPath(localAttrs.get("extractDir"), null);
        if (extractDir == null) {
            throw new RuntimeException("Must specify \"extractDir\" -- directory to crawl");
        }
        if (!Files.isDirectory(extractDir)) {
            throw new RuntimeException("ROOT DIRECTORY DOES NOT EXIST: " +
                    extractDir.toAbsolutePath());
        }

        Path inputDir = PropsUtil.getPath(localAttrs.get("inputDir"), null);
        //we _could_ fall back to extractDir here, but the Crawler defaults
        //to "input" if nothing is passed, so this won't work; fail before
        //doing any db work
        if (inputDir == null) {
            throw new RuntimeException("Must specify -inputDir");
        }

        ExtractReader.ALTER_METADATA_LIST alterExtractList = getAlterMetadata(localAttrs);

        IDBWriter writer = null;
        try {
            writer = getDBWriter();
        } catch (SQLException ex) {
            throw new IOException(ex);
        }

        //TODO: clean up the writing of the ref tables!!!
        try {
            populateRefTables(writer);
        } catch (SQLException e) {
            throw new RuntimeException("Can't populate ref tables", e);
        }
        //NOTE: the original had `if (extractDir == null ...) extractDir = inputDir;`
        //here -- unreachable, because a null extractDir already threw above
        return new ExtractProfiler(queue, inputDir, extractDir, writer, alterExtractList);
    }

    /**
     * @return every table the profiler writes to, plus the reference tables
     */
    @Override
    protected List<TableInfo> getTableInfo() {
        List<TableInfo> tableInfos = new ArrayList<>();
        tableInfos.add(AbstractProfiler.MIME_TABLE);
        tableInfos.add(AbstractProfiler.REF_PARSE_ERROR_TYPES);
        tableInfos.add(AbstractProfiler.REF_PARSE_EXCEPTION_TYPES);
        tableInfos.add(AbstractProfiler.REF_EXTRACT_ERROR_TYPES);
        tableInfos.add(ExtractProfiler.CONTAINER_TABLE);
        tableInfos.add(ExtractProfiler.PROFILE_TABLE);
        tableInfos.add(ExtractProfiler.ERROR_TABLE);
        tableInfos.add(ExtractProfiler.EXCEPTION_TABLE);
        tableInfos.add(ExtractProfiler.CONTENTS_TABLE);
        tableInfos.add(ExtractProfiler.EMBEDDED_FILE_PATH_TABLE);
        return tableInfos;
    }

    @Override
    protected IDBWriter getDBWriter() throws IOException, SQLException {
        return new DBWriter(getTableInfo(), TikaConfig.getDefaultConfig(), dbUtil);
    }

    /**
     * Registers the error-log file (if configured) so crawl errors are
     * loaded into the error table.
     */
    @Override
    protected void addErrorLogTablePairs(DBConsumersManager manager) {
        Path errorLog = PropsUtil.getPath(localAttrs.get("errorLogFile"), null);
        if (errorLog == null) {
            return;
        }
        manager.addErrorLogTablePair(errorLog, ExtractProfiler.ERROR_TABLE);
    }
}
b/tika-eval/src/main/java/org/apache/tika/eval/db/AbstractDBBuffer.java new file mode 100644 index 0000000..baa7994 --- /dev/null +++ b/tika-eval/src/main/java/org/apache/tika/eval/db/AbstractDBBuffer.java @@ -0,0 +1,77 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.apache.tika.eval.db; + +import java.sql.SQLException; +import java.util.HashMap; +import java.util.Map; +import java.util.concurrent.locks.Lock; +import java.util.concurrent.locks.ReentrantReadWriteLock; + + +/** + * Abstract buffer for map of values and unique ids. + * <p> + * Use this for fast in memory lookups of smallish sets of values. 
+ * + */ +abstract class AbstractDBBuffer { + + private final Map<String, Integer> m = new HashMap<>(); + private final ReentrantReadWriteLock rwl = new ReentrantReadWriteLock(); + private final Lock r = rwl.readLock(); + private final Lock w = rwl.writeLock(); + + private int numWrites = 0; + + public int getId(String key) { + r.lock(); + try { + Integer v = m.get(key); + if (v != null) { + return v; + } + } finally { + r.unlock(); + } + + try { + w.lock(); + Integer v = m.get(key); + if (v != null) { + return v; + } + v = m.size()+1; + m.put(key, v); + write(v, key); + numWrites++; + return v; + } finally { + w.unlock(); + } + } + + public int getNumWrites() { + return numWrites; + } + + //Odd to throw RuntimeException, I know. It should be + //catastrophic if this buffer can't write to the db. + public abstract void write(int id, String value) throws RuntimeException; + + public abstract void close() throws SQLException; +} http://git-wip-us.apache.org/repos/asf/tika/blob/aa7a0c35/tika-eval/src/main/java/org/apache/tika/eval/db/ColInfo.java ---------------------------------------------------------------------- diff --git a/tika-eval/src/main/java/org/apache/tika/eval/db/ColInfo.java b/tika-eval/src/main/java/org/apache/tika/eval/db/ColInfo.java new file mode 100644 index 0000000..a32f874 --- /dev/null +++ b/tika-eval/src/main/java/org/apache/tika/eval/db/ColInfo.java @@ -0,0 +1,116 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. 
package org.apache.tika.eval.db;


import java.sql.Types;
import java.util.Objects;

/**
 * Immutable definition of a single database column: a {@link Cols} name,
 * a {@link java.sql.Types} type, an optional precision (required for
 * CHAR/VARCHAR) and an optional SQL constraints string
 * (e.g. "PRIMARY KEY").
 */
public class ColInfo {
    private final Cols name;
    private final int type;
    private final Integer precision;
    private final String constraints;

    public ColInfo(Cols name, int type) {
        this(name, type, null, null);
    }

    public ColInfo(Cols name, int type, String constraints) {
        this(name, type, null, constraints);
    }

    public ColInfo(Cols name, int type, Integer precision) {
        this(name, type, precision, null);
    }


    public ColInfo(Cols name, int type, Integer precision, String constraints) {
        this.name = name;
        this.type = type;
        this.precision = precision;
        this.constraints = constraints;
    }

    public int getType() {
        return type;
    }

    public Cols getName() {
        return name;
    }

    /**
     *
     * @return constraints string or null
     */
    public String getConstraints() {
        return constraints;
    }

    /**
     * Gets the precision. This can be null!
     * @return precision or null
     */
    public Integer getPrecision() {
        return precision;
    }

    /**
     * @return the SQL column-type fragment for DDL, e.g. "VARCHAR(128)"
     * @throws IllegalStateException if precision is missing for CHAR/VARCHAR
     *         (the original silently emitted invalid SQL like "VARCHAR(null)")
     * @throws UnsupportedOperationException if the type is not recognized
     */
    public String getSqlDef() {
        if (type == Types.VARCHAR || type == Types.CHAR) {
            if (precision == null) {
                throw new IllegalStateException(
                        "Must specify precision for CHAR/VARCHAR column: " + name);
            }
            return (type == Types.VARCHAR ? "VARCHAR(" : "CHAR(") + precision + ")";
        }
        switch (type) {
            case Types.FLOAT:
                return "FLOAT";
            case Types.DOUBLE:
                return "DOUBLE";
            case Types.BLOB:
                return "BLOB";
            case Types.INTEGER:
                return "INTEGER";
            case Types.BIGINT:
                return "BIGINT";
            case Types.BOOLEAN:
                return "BOOLEAN";
        }
        throw new UnsupportedOperationException("Don't yet recognize a type for: " + type);
    }

    @Override
    public boolean equals(Object o) {
        if (this == o) return true;
        if (o == null || getClass() != o.getClass()) return false;

        ColInfo colInfo = (ColInfo) o;

        return type == colInfo.type
                && name == colInfo.name
                && Objects.equals(precision, colInfo.precision)
                && Objects.equals(constraints, colInfo.constraints);
    }

    @Override
    public int hashCode() {
        //same fields as equals; Objects.hash matches the original 31-based mix
        int result = name != null ? name.hashCode() : 0;
        result = 31 * result + type;
        result = 31 * result + (precision != null ? precision.hashCode() : 0);
        result = 31 * result + (constraints != null ? constraints.hashCode() : 0);
        return result;
    }
}
package org.apache.tika.eval.db;

/**
 * Canonical column names used across the tika-eval tables.
 * <p>
 * NOTE(review): do not reorder or rename constants casually -- names are
 * used to build SQL, and other classes key prepared-statement parameters
 * off these values.
 */
public enum Cols {
    //container table
    CONTAINER_ID,
    FILE_PATH,
    EXTRACT_FILE_LENGTH,

    EXTRACT_FILE_LENGTH_A, //for comparisons
    EXTRACT_FILE_LENGTH_B,

    //profile table
    ID,
    LENGTH,
    FILE_NAME,
    FILE_EXTENSION,
    ELAPSED_TIME_MILLIS,
    NUM_METADATA_VALUES,
    IS_EMBEDDED,
    EMBEDDED_FILE_PATH,
    MIME_TYPE_ID,
    MD5,
    NUM_ATTACHMENTS,
    HAS_CONTENT,

    //content
    CONTENT_LENGTH,
    NUM_UNIQUE_TOKENS,
    NUM_TOKENS,
    NUM_ALPHABETIC_TOKENS, //alphabetic or ideographic tokens
    COMMON_TOKENS_LANG, //which language was used for the common tokens metric?
    NUM_COMMON_TOKENS,
    TOP_N_TOKENS,
    LANG_ID_1,
    LANG_ID_PROB_1,
    LANG_ID_2,
    LANG_ID_PROB_2,
    TOKEN_ENTROPY_RATE,
    TOKEN_LENGTH_SUM,
    TOKEN_LENGTH_MEAN,
    TOKEN_LENGTH_STD_DEV,
    UNICODE_CHAR_BLOCKS,
    NUM_PAGES, //number of pages a document alleges it has

    //content comparisons
    TOP_10_UNIQUE_TOKEN_DIFFS_A,
    TOP_10_UNIQUE_TOKEN_DIFFS_B,
    TOP_10_MORE_IN_A,
    TOP_10_MORE_IN_B,
    OVERLAP,
    DICE_COEFFICIENT,

    //errors
    PARSE_ERROR_TYPE_ID,

    PARSE_ERROR_DESCRIPTION,
    PARSE_EXCEPTION_DESCRIPTION,

    EXTRACT_ERROR_TYPE_ID,
    EXTRACT_ERROR_DESCRIPTION,


    //exceptions
    ORIG_STACK_TRACE,
    SORT_STACK_TRACE,
    PARSE_EXCEPTION_TYPE_ID,


    MIME_STRING,//string representation of mime type

    DIR_NAME_A,//for comparisons in REF_PAIR_NAMES
    DIR_NAME_B
}
+ * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.apache.tika.eval.db; + + +import java.sql.Connection; +import java.sql.PreparedStatement; +import java.sql.SQLException; + +public class DBBuffer extends AbstractDBBuffer { + + private final PreparedStatement st; + + public DBBuffer(Connection connection, String tableName, + String idColumnName, String valueColumnName) throws SQLException { + st = connection.prepareStatement("insert into "+tableName+ "( "+ + idColumnName + ", " + valueColumnName+") values (?,?);"); + } + + @Override + public void write(int id, String value) throws RuntimeException { + try { + st.clearParameters(); + st.setInt(1, id); + st.setString(2, value); + st.execute(); + + } catch (SQLException e) { + throw new RuntimeException(e); + } + } + + @Override + public void close() throws SQLException { + st.close(); + + } + + +} http://git-wip-us.apache.org/repos/asf/tika/blob/aa7a0c35/tika-eval/src/main/java/org/apache/tika/eval/db/DBUtil.java ---------------------------------------------------------------------- diff --git a/tika-eval/src/main/java/org/apache/tika/eval/db/DBUtil.java b/tika-eval/src/main/java/org/apache/tika/eval/db/DBUtil.java new file mode 100644 index 0000000..1efa48a --- /dev/null +++ b/tika-eval/src/main/java/org/apache/tika/eval/db/DBUtil.java @@ -0,0 +1,201 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. 
package org.apache.tika.eval.db;


import java.io.IOException;
import java.nio.file.Path;
import java.sql.Connection;
import java.sql.DriverManager;
import java.sql.PreparedStatement;
import java.sql.SQLException;
import java.sql.Statement;
import java.sql.Types;
import java.util.List;
import java.util.Locale;
import java.util.Map;
import java.util.Set;

import org.apache.log4j.Logger;
import org.apache.tika.io.IOExceptionWithCause;

/**
 * Base class for JDBC utilities used by tika-eval: connection handling,
 * table creation and best-effort row insertion.
 */
public abstract class DBUtil {

    public static Logger logger = Logger.getLogger(DBUtil.class);

    private final Path db;

    public DBUtil(Path db) {
        this.db = db;
    }

    public abstract String getJDBCDriverClass();

    public abstract boolean dropTableIfExists(Connection conn, String tableName) throws SQLException;

    /**
     * This is intended for a file/directory based db.
     * <p>
     * Override this with any optimizations you want to do on the db
     * before writing/reading.
     *
     * @param createIfDoesntExist whether to create the db if it doesn't exist
     * @return a connection with auto-commit turned off
     * @throws IOException if the driver can't be loaded or the connection fails
     */
    public Connection getConnection(boolean createIfDoesntExist) throws IOException {
        String connectionString = getConnectionString(db, createIfDoesntExist);
        Connection conn = null;
        try {
            try {
                Class.forName(getJDBCDriverClass());
            } catch (ClassNotFoundException e) {
                throw new RuntimeException(e);
            }
            conn = DriverManager.getConnection(connectionString);
            conn.setAutoCommit(false);
        } catch (SQLException e) {
            throw new IOExceptionWithCause(e);
        }
        return conn;
    }

    abstract public String getConnectionString(Path db, boolean createIfDoesntExist);

    /**
     *
     * @param connection connection to interrogate
     * @return a list of uppercased table names
     * @throws SQLException on db failure
     */
    abstract public Set<String> getTables(Connection connection) throws SQLException;

    /**
     * Best-effort insert: binds each column of {@code table} from
     * {@code data} and executes.  SQLExceptions are logged and swallowed
     * (returns -1) so one bad row doesn't kill a batch run.
     *
     * @return rows inserted, or -1 on SQLException
     * @throws IllegalArgumentException if {@code data} contains a column not
     *         defined on the table
     */
    public static int insert(PreparedStatement insertStatement,
                             TableInfo table,
                             Map<Cols, String> data) throws SQLException {

        //validate before touching the statement
        for (Cols c : data.keySet()) {
            if (!table.containsColumn(c)) {
                throw new IllegalArgumentException("Can't add data to " + c +
                        " because it doesn't exist in the table: " + table.getName());
            }
        }
        //clear parameters before setting
        insertStatement.clearParameters();
        try {
            int i = 1;
            for (ColInfo colInfo : table.getColInfos()) {
                updateInsertStatement(i, insertStatement, colInfo, data.get(colInfo.getName()));
                i++;
            }
            return insertStatement.executeUpdate();
        } catch (SQLException e) {
            //deliberate best-effort: log and move on
            logger.warn("couldn't insert data for this row: " + e.getMessage());
            return -1;
        }
    }

    /**
     * Binds one value into the prepared statement, converting from String to
     * the column's SQL type.  Unparseable numbers and SQL binding errors are
     * logged and bound as NULL.
     *
     * @param dbColOffset 1-based parameter index
     */
    public static void updateInsertStatement(int dbColOffset, PreparedStatement st,
                                             ColInfo colInfo, String value) throws SQLException {
        if (value == null) {
            st.setNull(dbColOffset, colInfo.getType());
            return;
        }
        try {
            switch (colInfo.getType()) {
                case Types.VARCHAR:
                    //guard against a null precision (unbounded column)
                    if (colInfo.getPrecision() != null && value.length() > colInfo.getPrecision()) {
                        value = value.substring(0, colInfo.getPrecision());
                        logger.warn("truncated varchar value in " + colInfo.getName() + " : " + value);
                    }
                    st.setString(dbColOffset, value);
                    break;
                case Types.CHAR:
                    st.setString(dbColOffset, value);
                    break;
                case Types.DOUBLE:
                    st.setDouble(dbColOffset, Double.parseDouble(value));
                    break;
                case Types.FLOAT:
                    //JDBC Types.FLOAT is double precision; parse as float then widen
                    st.setDouble(dbColOffset, Float.parseFloat(value));
                    break;
                case Types.INTEGER:
                    //was st.setDouble(...) -- wrong setter for an INTEGER column
                    st.setInt(dbColOffset, Integer.parseInt(value));
                    break;
                case Types.BIGINT:
                    st.setLong(dbColOffset, Long.parseLong(value));
                    break;
                case Types.BOOLEAN:
                    st.setBoolean(dbColOffset, Boolean.parseBoolean(value));
                    break;
                default:
                    throw new UnsupportedOperationException("Don't yet support type: " + colInfo.getType());
            }
        } catch (NumberFormatException e) {
            if (!"".equals(value)) {
                logger.warn("number format exception: " + colInfo.getName() + " : " + value);
            }
            st.setNull(dbColOffset, colInfo.getType());
        } catch (SQLException e) {
            logger.warn("sqlexception: " + colInfo + " : " + value);
            st.setNull(dbColOffset, colInfo.getType());
        }
    }

    /**
     * Creates (or, if {@code append}, skips existing) tables and commits.
     * The connection is closed even if table creation fails.
     */
    public void createDB(List<TableInfo> tableInfos, boolean append) throws SQLException, IOException {
        try (Connection conn = getConnection(true)) {
            Set<String> tables = getTables(conn);

            for (TableInfo tableInfo : tableInfos) {
                if (append && tables.contains(tableInfo.getName().toUpperCase(Locale.ROOT))) {
                    continue;
                }
                if (!append) {
                    dropTableIfExists(conn, tableInfo.getName());
                }
                createTable(conn, tableInfo);
            }
            conn.commit();
        }
    }

    //Builds and executes the CREATE TABLE statement for one table, then commits.
    private void createTable(Connection conn, TableInfo tableInfo) throws SQLException {
        StringBuilder createSql = new StringBuilder();
        createSql.append("CREATE TABLE ").append(tableInfo.getName());
        createSql.append("(");

        int last = 0;
        for (ColInfo col : tableInfo.getColInfos()) {
            last++;
            if (last > 1) {
                createSql.append(", ");
            }
            createSql.append(col.getName());
            createSql.append(" ");
            createSql.append(col.getSqlDef());
            String constraints = col.getConstraints();
            if (constraints != null) {
                createSql.append(" ");
                createSql.append(constraints);
            }
        }
        createSql.append(")");
        //try-with-resources: the original leaked the Statement on exception
        try (Statement st = conn.createStatement()) {
            st.execute(createSql.toString());
        }
        conn.commit();
    }
}
package org.apache.tika.eval.db;

import java.nio.file.Path;
import java.sql.Connection;
import java.sql.ResultSet;
import java.sql.SQLException;
import java.sql.Statement;
import java.util.HashSet;
import java.util.Set;

import org.apache.commons.io.FilenameUtils;


/**
 * H2-specific {@link DBUtil}: driver class, connection string and
 * table introspection.
 */
public class H2Util extends DBUtil {

    public H2Util(Path db) {
        super(db);
    }

    @Override
    public String getJDBCDriverClass() {
        return "org.h2.Driver";
    }

    /**
     * Drops the table if it exists.  The Statement is closed even on
     * failure (the original leaked it on exception).
     */
    @Override
    public boolean dropTableIfExists(Connection conn, String tableName) throws SQLException {
        String sql = "drop table if exists " + tableName;
        try (Statement st = conn.createStatement()) {
            return st.execute(sql);
        }
    }

    /**
     * @return jdbc:h2 URL for the db path; appends ;IFEXISTS=TRUE when the
     *         db must already exist
     */
    @Override
    public String getConnectionString(Path db, boolean createIfDoesntExist) {
        String s = "jdbc:h2:" + FilenameUtils.separatorsToUnix(db.toAbsolutePath().toString());
        if (!createIfDoesntExist) {
            s += ";IFEXISTS=TRUE";
        }
        return s;
    }

    /**
     * @return table names as reported by H2's SHOW TABLES
     */
    @Override
    public Set<String> getTables(Connection connection) throws SQLException {
        String sql = "SHOW TABLES";
        Set<String> tables = new HashSet<>();
        //try-with-resources: the original never closed st or rs
        try (Statement st = connection.createStatement();
                ResultSet rs = st.executeQuery(sql)) {
            while (rs.next()) {
                tables.add(rs.getString(1));
            }
        }
        return tables;
    }
}
package org.apache.tika.eval.db;

import java.sql.Connection;
import java.sql.PreparedStatement;
import java.sql.SQLException;
import java.sql.Types;

import org.apache.tika.config.TikaConfig;
import org.apache.tika.eval.AbstractProfiler;
import org.apache.tika.mime.MediaType;
import org.apache.tika.mime.MimeType;
import org.apache.tika.mime.MimeTypeException;
import org.apache.tika.mime.MimeTypes;


/**
 * {@link AbstractDBBuffer} for mime types: writes (id, mime string,
 * most-common file extension) rows into the mime table.
 */
public class MimeBuffer extends AbstractDBBuffer {

    private final PreparedStatement st;
    private final TikaConfig config;

    public MimeBuffer(Connection connection, TikaConfig config) throws SQLException {
        //use .name() consistently (the original mixed .name() and implicit toString())
        st = connection.prepareStatement("insert into " + AbstractProfiler.MIME_TABLE.getName() + "( " +
                Cols.MIME_TYPE_ID.name() + ", " +
                Cols.MIME_STRING.name() + ", " +
                Cols.FILE_EXTENSION.name() + ") values (?,?,?);");
        this.config = config;
    }

    /**
     * Inserts the (id, mime string, extension) row; the extension column is
     * NULL when no extension can be determined.
     */
    @Override
    public void write(int id, String value) throws RuntimeException {
        try {
            st.clearParameters();
            st.setInt(1, id);
            st.setString(2, value);
            try {
                String ext = MimeUtil.getExtension(value, config);
                if (ext == null || ext.length() == 0) {
                    st.setNull(3, Types.VARCHAR);
                } else {
                    st.setString(3, ext);
                }
            } catch (MimeTypeException e) {
                //unparseable content type: record NULL extension
                st.setNull(3, Types.VARCHAR);
            }
            st.execute();

        } catch (SQLException e) {
            throw new RuntimeException(e);
        }
    }

    @Override
    public void close() throws SQLException {
        st.close();
    }

    private static class MimeUtil {
        //TODO: see if MimeType now works for these
        private static final String APPLICATION = "application";
        private static final String TEXT = "text";
        private static final String HTML = "html";
        private static final String XML = "xml";
        private static final String XHTML_XML = "xhtml+xml";
        private static final String CSS = "css";
        private static final String CSV = "csv";
        private static final String PLAIN = "plain";
        private static final String EMPTY_STRING = "";

        /**
         * Utility method to convert from a string value representing a content type
         * (e.g. "application/pdf") into the most common extension for that file type
         * (e.g. "pdf").
         * <p>
         * This has special handling for texty filetypes whose MimeTypes
         * don't currently return anything for {@link org.apache.tika.mime.MimeType#getExtension};
         *
         * @param contentType string representing a content type, for example: "application/pdf"
         * @param config config from which to get MimeRepository
         * @return extension or empty string
         * @throws org.apache.tika.mime.MimeTypeException thrown if MimeTypes can't parse the contentType
         */
        public static String getExtension(String contentType, TikaConfig config)
                throws MimeTypeException {
            MimeTypes types = config.getMimeRepository();
            MimeType mime = types.forName(contentType);
            return getExtension(mime);
        }

        public static String getExtension(MimeType mime) {
            String ext = mime.getExtension();
            if (ext.startsWith(".")) {
                ext = ext.substring(1);
            }

            //special handling for text/html/xml
            if (ext.length() == 0) {
                ext = tryTextyTypes(mime.getType());
            }
            return ext;
        }

        //Maps texty media types with no registered extension to a sensible one.
        private static String tryTextyTypes(MediaType mediaType) {
            String type = mediaType.getType();
            String subtype = mediaType.getSubtype();
            if (type.equals(TEXT)) {
                if (subtype.equals(HTML)) {
                    return HTML;
                } else if (subtype.equals(PLAIN)) {
                    return "txt";
                } else if (subtype.equals(CSS)) {
                    return CSS;
                } else if (subtype.equals(CSV)) {
                    return CSV;
                }
            } else if (type.equals(APPLICATION)) {
                if (subtype.equals(XML)) {
                    return XML;
                } else if (subtype.equals(XHTML_XML)) {
                    //use the HTML constant (the original had the literal "html")
                    return HTML;
                }
            }
            return EMPTY_STRING;
        }
    }

}
package org.apache.tika.eval.db;


import java.util.ArrayList;
import java.util.Arrays;
import java.util.Collections;
import java.util.HashSet;
import java.util.List;
import java.util.Set;

/**
 * Immutable description of a database table: its name and ordered list of
 * {@link ColInfo} columns.  Column names must be unique.
 */
public class TableInfo {

    private final String name;
    private final List<ColInfo> colInfos = new ArrayList<>();
    private final Set<Cols> colNames = new HashSet<>();

    public TableInfo(String name, ColInfo... cols) {
        this(name, Arrays.asList(cols));
    }

    /**
     * @throws IllegalArgumentException on a duplicate column name (the
     *         original used an assert, which is off by default)
     */
    public TableInfo(String name, List<ColInfo> cols) {
        this.name = name;
        colInfos.addAll(cols);
        for (ColInfo c : colInfos) {
            if (!colNames.add(c.getName())) {
                throw new IllegalArgumentException(
                        "Duplicate column " + c.getName() + " in table " + name);
            }
        }
    }

    public String getName() {
        return name;
    }

    /**
     * @return unmodifiable view of the columns, in declaration order.
     *         (The original called Collections.unmodifiableList and
     *         discarded the result, exposing the mutable internal list.)
     */
    public List<ColInfo> getColInfos() {
        return Collections.unmodifiableList(colInfos);
    }

    public boolean containsColumn(Cols cols) {
        return colNames.contains(cols);
    }
}
/*
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements.  See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License.  You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
package org.apache.tika.eval.io;

import java.io.IOException;
import java.sql.Connection;
import java.sql.PreparedStatement;
import java.sql.SQLException;
import java.util.HashMap;
import java.util.List;
import java.util.Map;
import java.util.concurrent.atomic.AtomicLong;

import org.apache.log4j.Logger;
import org.apache.tika.config.TikaConfig;
import org.apache.tika.eval.db.ColInfo;
import org.apache.tika.eval.db.Cols;
import org.apache.tika.eval.db.DBUtil;
import org.apache.tika.eval.db.MimeBuffer;
import org.apache.tika.eval.db.TableInfo;

/**
 * This is still in its early stages.  The idea is to
 * get something working with h2 and then add to that
 * as necessary.
 *
 * Beware, this deletes the db file with each initialization.
 */
public class DBWriter implements IDBWriter {

    private static final Logger LOGGER = Logger.getLogger(DBWriter.class);

    //commit after this many inserted rows (primitive, not boxed Long)
    private static final long COMMIT_EVERY_X_ROWS = 1000L;

    private final AtomicLong insertedRows = new AtomicLong();

    private final List<TableInfo> tableInfos;
    private final Connection conn;
    private final DBUtil dbUtil;

    //shared, lazily initialized; guarded by DBWriter.class.
    //NOTE(review): this static buffer is bound to the first writer's
    //connection and close() shuts it down for every instance — confirm
    //that only one DBWriter is ever live at a time.
    private static MimeBuffer mimeBuffer;

    //<tableName, preparedStatement>
    private final Map<String, PreparedStatement> inserts = new HashMap<>();

    /**
     * @param tableInfos tables this writer will insert into; one prepared
     *                   statement is created per table up front
     * @param tikaConfig config used to build the shared mime buffer
     * @param dbUtil source of the (auto-created) connection
     * @throws IOException on mime-buffer initialization failure
     * @throws SQLException if the connection or a prepared statement
     *         cannot be created (previously wrapped in RuntimeException
     *         even though SQLException was already declared)
     */
    public DBWriter(List<TableInfo> tableInfos, TikaConfig tikaConfig, DBUtil dbUtil)
            throws IOException, SQLException {

        this.conn = dbUtil.getConnection(true);
        //synchronize lazy init: unsynchronized check-then-act raced when
        //several writers were constructed concurrently
        synchronized (DBWriter.class) {
            if (mimeBuffer == null) {
                mimeBuffer = new MimeBuffer(conn, tikaConfig);
            }
        }
        this.tableInfos = tableInfos;
        this.dbUtil = dbUtil;
        for (TableInfo tableInfo : tableInfos) {
            inserts.put(tableInfo.getName(), createPreparedInsert(tableInfo));
        }
    }

    public int getMimeId(String mimeString) {
        return mimeBuffer.getId(mimeString);
    }

    //Builds "INSERT INTO <table>(c1, c2, ...) VALUES(?, ?, ...)"
    //with one placeholder per column.
    private PreparedStatement createPreparedInsert(TableInfo tableInfo)
            throws SQLException {
        StringBuilder sb = new StringBuilder();
        sb.append("INSERT INTO ").append(tableInfo.getName());
        sb.append("(");
        int i = 0;
        for (ColInfo c : tableInfo.getColInfos()) {
            if (i++ > 0) {
                sb.append(", ");
            }
            sb.append(c.getName());
        }
        sb.append(") ");

        sb.append("VALUES");
        sb.append("(");
        for (int j = 0; j < i; j++) {
            if (j > 0) {
                sb.append(", ");
            }
            sb.append("?");
        }
        sb.append(")");

        return conn.prepareStatement(sb.toString());
    }

    /**
     * Inserts one row into {@code table}, committing every
     * {@link #COMMIT_EVERY_X_ROWS} rows.
     *
     * @throws IOException wrapping any SQLException
     */
    public void writeRow(TableInfo table, Map<Cols, String> data) throws IOException {
        try {
            PreparedStatement p = inserts.get(table.getName());
            if (p == null) {
                throw new RuntimeException("Failed to create prepared statement for: " +
                        table.getName());
            }
            dbUtil.insert(p, table, data);
            long rows = insertedRows.incrementAndGet();
            if (rows % COMMIT_EVERY_X_ROWS == 0) {
                LOGGER.info("writer is committing after " + rows + " rows");
                conn.commit();
            }
        } catch (SQLException e) {
            throw new IOException(e);
        }
    }

    /**
     * Flushes and closes.  The connection is now closed even when the final
     * commit fails (the original skipped {@code conn.close()} in that case),
     * and the stray {@code e.printStackTrace()} before the rethrow is gone.
     */
    public void close() throws IOException {
        try {
            try {
                mimeBuffer.close();
                conn.commit();
            } catch (SQLException e) {
                throw new IOException(e);
            }
        } finally {
            try {
                conn.close();
            } catch (SQLException e) {
                throw new IOException(e);
            }
        }
    }
}
