TIKA-1332 initial commit of tika-eval. More work remains.
Project: http://git-wip-us.apache.org/repos/asf/tika/repo Commit: http://git-wip-us.apache.org/repos/asf/tika/commit/5e49c330 Tree: http://git-wip-us.apache.org/repos/asf/tika/tree/5e49c330 Diff: http://git-wip-us.apache.org/repos/asf/tika/diff/5e49c330 Branch: refs/heads/2.x Commit: 5e49c33087bbf03763b05efda3bbb96d8cc20ea4 Parents: 6bfe5d5 Author: tballison <[email protected]> Authored: Thu Feb 16 12:19:54 2017 -0500 Committer: tballison <[email protected]> Committed: Thu Feb 16 12:19:54 2017 -0500 ---------------------------------------------------------------------- CHANGES.txt | 2 + LICENSE.txt | 8 + pom.xml | 1 + tika-eval/pom.xml | 281 +++++++ .../org/apache/tika/eval/AbstractProfiler.java | 693 ++++++++++++++++ .../org/apache/tika/eval/EvalFilePaths.java | 108 +++ .../org/apache/tika/eval/ExtractComparer.java | 455 +++++++++++ .../org/apache/tika/eval/ExtractProfiler.java | 238 ++++++ .../java/org/apache/tika/eval/TikaEvalCLI.java | 262 ++++++ .../apache/tika/eval/XMLErrorLogUpdater.java | 226 ++++++ .../tika/eval/batch/DBConsumersManager.java | 92 +++ .../tika/eval/batch/EvalConsumerBuilder.java | 134 ++++ .../tika/eval/batch/EvalConsumersBuilder.java | 133 ++++ .../tika/eval/batch/FileComparerBuilder.java | 122 +++ .../eval/batch/SingleFileConsumerBuilder.java | 108 +++ .../apache/tika/eval/db/AbstractDBBuffer.java | 77 ++ .../java/org/apache/tika/eval/db/ColInfo.java | 116 +++ .../main/java/org/apache/tika/eval/db/Cols.java | 90 +++ .../java/org/apache/tika/eval/db/DBBuffer.java | 54 ++ .../java/org/apache/tika/eval/db/DBUtil.java | 201 +++++ .../java/org/apache/tika/eval/db/H2Util.java | 71 ++ .../org/apache/tika/eval/db/MimeBuffer.java | 144 ++++ .../java/org/apache/tika/eval/db/TableInfo.java | 64 ++ .../java/org/apache/tika/eval/io/DBWriter.java | 141 ++++ .../org/apache/tika/eval/io/ExtractReader.java | 161 ++++ .../java/org/apache/tika/eval/io/IDBWriter.java | 31 + .../apache/tika/eval/io/XMLLogMsgHandler.java | 26 + 
.../org/apache/tika/eval/io/XMLLogReader.java | 120 +++ .../org/apache/tika/eval/reports/Report.java | 197 +++++ .../tika/eval/reports/ResultsReporter.java | 295 +++++++ .../tika/eval/reports/XLSXHREFFormatter.java | 79 ++ .../tika/eval/reports/XLSXNumFormatter.java | 54 ++ .../tika/eval/reports/XSLXCellFormatter.java | 30 + .../tokens/AlphaIdeographFilterFactory.java | 74 ++ .../tika/eval/tokens/AnalyzerDeserializer.java | 345 ++++++++ .../tika/eval/tokens/AnalyzerManager.java | 95 +++ .../CJKBigramAwareLengthFilterFactory.java | 74 ++ .../eval/tokens/CommonTokenCountManager.java | 141 ++++ .../tika/eval/tokens/CommonTokenResult.java | 37 + .../tika/eval/tokens/ContrastStatistics.java | 78 ++ .../tika/eval/tokens/TokenContraster.java | 183 +++++ .../eval/tokens/TokenCountPriorityQueue.java | 49 ++ .../apache/tika/eval/tokens/TokenCounter.java | 167 ++++ .../apache/tika/eval/tokens/TokenIntPair.java | 82 ++ .../tika/eval/tokens/TokenStatistics.java | 127 +++ .../tika/eval/util/LanguageIDWrapper.java | 69 ++ ...ache.lucene.analysis.util.TokenFilterFactory | 17 + .../src/main/resources/comparison-reports.xml | 791 +++++++++++++++++++ .../src/main/resources/lucene-analyzers.json | 107 +++ .../src/main/resources/lucene-char-mapping.txt | 2 + .../src/main/resources/profile-reports.xml | 148 ++++ .../resources/tika-eval-comparison-config.xml | 81 ++ .../resources/tika-eval-profiler-config.xml | 76 ++ .../test/java/org/apache/tika/MockDBWriter.java | 73 ++ .../apache/tika/eval/AnalyzerManagerTest.java | 79 ++ .../org/apache/tika/eval/ComparerBatchTest.java | 411 ++++++++++ .../org/apache/tika/eval/ProfilerBatchTest.java | 236 ++++++ .../apache/tika/eval/SimpleComparerTest.java | 289 +++++++ .../org/apache/tika/eval/TikaEvalCLITest.java | 42 + .../apache/tika/eval/db/AbstractBufferTest.java | 160 ++++ .../apache/tika/eval/io/ExtractReaderTest.java | 85 ++ .../tika/eval/io/FatalExceptionReaderTest.java | 32 + .../tika/eval/reports/ResultsReporterTest.java | 60 ++ 
.../tika/eval/tokens/LuceneTokenCounter.java | 191 +++++ .../tika/eval/tokens/TokenCounterTest.java | 131 +++ .../org/apache/tika/eval/util/MimeUtilTest.java | 65 ++ tika-eval/src/test/resources/commontokens/en | 8 + tika-eval/src/test/resources/commontokens/es | 10 + tika-eval/src/test/resources/commontokens/zh-cn | 8 + tika-eval/src/test/resources/commontokens/zh-tw | 8 + tika-eval/src/test/resources/log4j.properties | 11 + .../src/test/resources/log4j_process.properties | 11 + ...ingle-file-profiler-crawl-extract-config.xml | 72 ++ .../single-file-profiler-crawl-input-config.xml | 73 ++ .../batch-logs/batch-process-fatal.xml | 59 ++ .../test-dirs/extractsA/file1.pdf.json | 5 + .../extractsA/file10_permahang.txt.json | 0 .../test-dirs/extractsA/file11_oom.txt.json | 0 .../test-dirs/extractsA/file12_es.txt.json | 4 + .../extractsA/file13_attachANotB.doc.json | 10 + .../extractsA/file2_attachANotB.doc.json | 10 + .../extractsA/file3_attachBNotA.doc.json | 4 + .../test-dirs/extractsA/file4_emptyB.pdf.json | 4 + .../test-dirs/extractsA/file5_emptyA.pdf.json | 0 .../test-dirs/extractsA/file6_accessEx.pdf.json | 1 + .../test-dirs/extractsA/file7_badJson.pdf.json | 4 + .../test-dirs/extractsA/file8_IOEx.pdf.json | 1 + .../test-dirs/extractsB/file1.pdf.json | 2 + .../test-dirs/extractsB/file11_oom.txt.json | 0 .../test-dirs/extractsB/file12_es.txt.json | 4 + .../extractsB/file13_attachANotB.doc.txt | 1 + .../extractsB/file2_attachANotB.doc.json | 4 + .../extractsB/file3_attachBNotA.doc.json | 10 + .../test-dirs/extractsB/file4_emptyB.pdf.json | 0 .../test-dirs/extractsB/file5_emptyA.pdf.json | 4 + .../test-dirs/extractsB/file6_accessEx.pdf.json | 1 + .../test-dirs/extractsB/file7_badJson.pdf.json | 0 .../test-dirs/extractsB/file8_IOEx.pdf.json | 1 + .../resources/test-dirs/raw_input/file1.pdf | 13 + .../test-dirs/raw_input/file11_oom.txt | 2 + .../test-dirs/raw_input/file2_attachANotB.doc | 13 + .../test-dirs/raw_input/file3_attachBNotA.doc | 13 + 
.../test-dirs/raw_input/file4_emptyB.pdf | 13 + .../test-dirs/raw_input/file5_emptyA.pdf | 13 + .../test-dirs/raw_input/file6_accessEx.pdf | 13 + .../test-dirs/raw_input/file7_badJson.pdf | 13 + .../test-dirs/raw_input/file8_IOEx.pdf | 13 + .../test-dirs/raw_input/file9_noextract.txt | 1 + 108 files changed, 9848 insertions(+) ---------------------------------------------------------------------- http://git-wip-us.apache.org/repos/asf/tika/blob/5e49c330/CHANGES.txt ---------------------------------------------------------------------- diff --git a/CHANGES.txt b/CHANGES.txt index 72fc96c..785edfd 100644 --- a/CHANGES.txt +++ b/CHANGES.txt @@ -17,6 +17,8 @@ Release 2.0 - ??? Release 1.15 -??? + * Added tika-eval module (TIKA-1332). + * Add parsers for EMF/WMF files (TIKA-2246/TIKA-2247). * Official mime types for BMP, EMF and WMF have been registered with http://git-wip-us.apache.org/repos/asf/tika/blob/5e49c330/LICENSE.txt ---------------------------------------------------------------------- diff --git a/LICENSE.txt b/LICENSE.txt index 9576237..0673358 100644 --- a/LICENSE.txt +++ b/LICENSE.txt @@ -370,3 +370,11 @@ JUnRAR (https://github.com/edmund-wagner/junrar/) Sqlite (bundled in org.xerial's sqlite-jdbc) This product bundles Sqlite, which is in the Public Domain. For details see: https://www.sqlite.org/copyright.html + +H2 Database in tika-eval + This software contains unmodified binary redistributions for + H2 database engine (http://www.h2database.com/), + which is dual licensed and available under the MPL 2.0 + (Mozilla Public License) or under the EPL 1.0 (Eclipse Public License). 
+ An original copy of the license agreement can be found at: + http://www.h2database.com/html/license.html \ No newline at end of file http://git-wip-us.apache.org/repos/asf/tika/blob/5e49c330/pom.xml ---------------------------------------------------------------------- diff --git a/pom.xml b/pom.xml index 7df81eb..8e7ff7b 100644 --- a/pom.xml +++ b/pom.xml @@ -61,6 +61,7 @@ <module>tika-langdetect</module> <module>tika-example</module> <module>tika-java7</module> + <module>tika-eval</module> </modules> <profiles> http://git-wip-us.apache.org/repos/asf/tika/blob/5e49c330/tika-eval/pom.xml ---------------------------------------------------------------------- diff --git a/tika-eval/pom.xml b/tika-eval/pom.xml new file mode 100644 index 0000000..ee0940c --- /dev/null +++ b/tika-eval/pom.xml @@ -0,0 +1,281 @@ +<?xml version="1.0" encoding="UTF-8"?> + +<!-- + Licensed to the Apache Software Foundation (ASF) under one + or more contributor license agreements. See the NOTICE file + distributed with this work for additional information + regarding copyright ownership. The ASF licenses this file + to you under the Apache License, Version 2.0 (the + "License"); you may not use this file except in compliance + with the License. You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, + software distributed under the License is distributed on an + "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + KIND, either express or implied. See the License for the + specific language governing permissions and limitations + under the License. 
+--> + +<project xmlns="http://maven.apache.org/POM/4.0.0" + xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance" + xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/xsd/maven-4.0.0.xsd"> + + <modelVersion>4.0.0</modelVersion> + <properties> + <cli.version>1.3.1</cli.version> <!--sync version with tika-server or move to parent? --> + <lucene.version>6.2.1</lucene.version> + <poi.version>3.16-beta2</poi.version> + + </properties> + + <parent> + <groupId>org.apache.tika</groupId> + <artifactId>tika-parent</artifactId> + <version>1.15-SNAPSHOT</version> + <relativePath>../tika-parent/pom.xml</relativePath> + </parent> + + <artifactId>tika-eval</artifactId> + <name>Apache Tika eval</name> + <url>http://tika.apache.org/</url> + + <dependencies> + <dependency> + <groupId>${project.groupId}</groupId> + <artifactId>tika-core</artifactId> + <version>${project.version}</version> + </dependency> + <dependency> + <groupId>${project.groupId}</groupId> + <artifactId>tika-batch</artifactId> + <version>${project.version}</version> + </dependency> + <dependency> + <groupId>${project.groupId}</groupId> + <artifactId>tika-serialization</artifactId> + <version>${project.version}</version> + </dependency> + <dependency> + <groupId>org.apache.commons</groupId> + <artifactId>commons-math3</artifactId> + <version>3.4.1</version> + </dependency> + <dependency> + <groupId>org.apache.commons</groupId> + <artifactId>commons-csv</artifactId> + <version>1.1</version> + </dependency> + <dependency> + <groupId>com.h2database</groupId> + <artifactId>h2</artifactId> + <version>1.4.193</version> + </dependency> + <dependency> + <groupId>com.optimaize.languagedetector</groupId> + <artifactId>language-detector</artifactId> + <version>0.5</version> + </dependency> + <dependency> + <groupId>commons-cli</groupId> + <artifactId>commons-cli</artifactId> + <version>${cli.version}</version> + </dependency> + <dependency> + <groupId>commons-io</groupId> + 
<artifactId>commons-io</artifactId> + <version>2.4</version> + </dependency> + + <dependency> + <groupId>org.apache.lucene</groupId> + <artifactId>lucene-core</artifactId> + <version>${lucene.version}</version> + </dependency> + <dependency> + <groupId>org.apache.lucene</groupId> + <artifactId>lucene-analyzers-common</artifactId> + <version>${lucene.version}</version> + </dependency> + <dependency> + <groupId>org.apache.lucene</groupId> + <artifactId>lucene-analyzers-icu</artifactId> + <version>${lucene.version}</version> + </dependency> + <dependency> + <groupId>org.apache.lucene</groupId> + <artifactId>lucene-memory</artifactId> + <version>${lucene.version}</version> + </dependency> + <dependency> + <groupId>org.apache.commons</groupId> + <artifactId>commons-lang3</artifactId> + <version>3.4</version> + </dependency> + + <dependency> + <groupId>org.apache.poi</groupId> + <artifactId>poi</artifactId> + <version>${poi.version}</version> + </dependency> + <dependency> + <groupId>org.apache.poi</groupId> + <artifactId>poi-ooxml</artifactId> + <version>${poi.version}</version> + </dependency> + <dependency> + <groupId>org.apache.poi</groupId> + <artifactId>poi-ooxml-schemas</artifactId> + <version>${poi.version}</version> + </dependency> + <dependency> + <groupId>org.apache.poi</groupId> + <artifactId>poi-scratchpad</artifactId> + <version>${poi.version}</version> + </dependency> + <dependency> + <groupId>org.apache.tika</groupId> + <artifactId>tika-batch</artifactId> + <version>${project.version}</version> + <type>test-jar</type> + <scope>test</scope> + </dependency> + <dependency> + <groupId>org.apache.tika</groupId> + <artifactId>tika-core</artifactId> + <version>${project.version}</version> + <type>test-jar</type> + <scope>test</scope> + </dependency> + <dependency> + <groupId>junit</groupId> + <artifactId>junit</artifactId> + <scope>test</scope> + </dependency> + + + </dependencies> + <build> + <plugins> + <plugin> + <artifactId>maven-shade-plugin</artifactId> + 
<executions> + <execution> + <phase>package</phase> + <goals> + <goal>shade</goal> + </goals> + <configuration> + <createDependencyReducedPom> + false + </createDependencyReducedPom> + <!-- <filters> --> + <transformers> + <transformer implementation="org.apache.maven.plugins.shade.resource.ManifestResourceTransformer"> + <mainClass>org.apache.tika.eval.TikaEvalCLI</mainClass> + </transformer> + + <transformer implementation="org.apache.maven.plugins.shade.resource.ServicesResourceTransformer" /> + </transformers> + </configuration> + </execution> + </executions> + </plugin> + <plugin> + <artifactId>maven-remote-resources-plugin</artifactId> + <version>1.5</version> + <executions> + <execution> + <goals> + <goal>bundle</goal> + </goals> + </execution> + </executions> + <configuration> + <includes> + <include>**/*.xml</include> + </includes> + </configuration> + </plugin> + + <plugin> + <groupId>org.apache.felix</groupId> + <artifactId>maven-bundle-plugin</artifactId> + <extensions>true</extensions> + <configuration> + <instructions> + <Bundle-DocURL>${project.url}</Bundle-DocURL> + <Bundle-Activator> + org.apache.tika.config.TikaActivator + </Bundle-Activator> + <Bundle-ActivationPolicy>lazy</Bundle-ActivationPolicy> + </instructions> + </configuration> + </plugin> + <plugin> + <groupId>org.apache.rat</groupId> + <artifactId>apache-rat-plugin</artifactId> + <configuration> + <excludes> + <exclude>src/test/resources/org/apache/tika/**</exclude> + </excludes> + </configuration> + </plugin> + <plugin> + <groupId>org.apache.maven.plugins</groupId> + <artifactId>maven-jar-plugin</artifactId> + <executions> + <execution> + <goals> + <goal>test-jar</goal> + </goals> + </execution> + </executions> + </plugin> + <plugin> + <artifactId>maven-failsafe-plugin</artifactId> + <version>2.10</version> + <configuration> + <additionalClasspathElements> + <additionalClasspathElement> + ${project.build.directory}/${project.build.finalName}.jar + </additionalClasspathElement> + 
</additionalClasspathElements> + </configuration> + <executions> + <execution> + <goals> + <goal>integration-test</goal> + <goal>verify</goal> + </goals> + </execution> + </executions> + </plugin> + </plugins> + + </build> + + + + <organization> + <name>The Apache Software Foundation</name> + <url>http://www.apache.org</url> + </organization> + <scm> + <url>http://svn.apache.org/viewvc/tika/trunk/tika-eval</url> + <connection>scm:svn:http://svn.apache.org/repos/asf/tika/trunk/tika-eval</connection> + <developerConnection>scm:svn:https://svn.apache.org/repos/asf/tika/trunk/tika-eval</developerConnection> + </scm> + <issueManagement> + <system>JIRA</system> + <url>https://issues.apache.org/jira/browse/TIKA</url> + </issueManagement> + <ciManagement> + <system>Jenkins</system> + <url>https://builds.apache.org/job/Tika-trunk/</url> + </ciManagement> + + +</project> http://git-wip-us.apache.org/repos/asf/tika/blob/5e49c330/tika-eval/src/main/java/org/apache/tika/eval/AbstractProfiler.java ---------------------------------------------------------------------- diff --git a/tika-eval/src/main/java/org/apache/tika/eval/AbstractProfiler.java b/tika-eval/src/main/java/org/apache/tika/eval/AbstractProfiler.java new file mode 100644 index 0000000..24f7358 --- /dev/null +++ b/tika-eval/src/main/java/org/apache/tika/eval/AbstractProfiler.java @@ -0,0 +1,693 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. 
You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.tika.eval; + + +import java.io.IOException; +import java.io.Reader; +import java.io.StringReader; +import java.nio.file.Files; +import java.nio.file.Path; +import java.nio.file.Paths; +import java.sql.Types; +import java.util.ArrayList; +import java.util.Collections; +import java.util.Comparator; +import java.util.HashMap; +import java.util.List; +import java.util.Map; +import java.util.concurrent.ArrayBlockingQueue; +import java.util.concurrent.atomic.AtomicInteger; +import java.util.regex.Matcher; +import java.util.regex.Pattern; + +import com.optimaize.langdetect.DetectedLanguage; +import org.apache.commons.io.FilenameUtils; +import org.apache.commons.lang3.tuple.Pair; +import org.apache.commons.math3.stat.descriptive.SummaryStatistics; +import org.apache.tika.batch.FileResource; +import org.apache.tika.batch.FileResourceConsumer; +import org.apache.tika.batch.fs.FSProperties; +import org.apache.tika.config.TikaConfig; +import org.apache.tika.eval.db.ColInfo; +import org.apache.tika.eval.db.Cols; +import org.apache.tika.eval.db.TableInfo; +import org.apache.tika.eval.io.IDBWriter; +import org.apache.tika.eval.tokens.AnalyzerManager; +import org.apache.tika.eval.tokens.CommonTokenCountManager; +import org.apache.tika.eval.tokens.CommonTokenResult; +import org.apache.tika.eval.tokens.TokenCounter; +import org.apache.tika.eval.tokens.TokenIntPair; +import org.apache.tika.eval.tokens.TokenStatistics; +import org.apache.tika.eval.util.LanguageIDWrapper; +import org.apache.tika.metadata.Metadata; +import 
org.apache.tika.metadata.PagedText; +import org.apache.tika.metadata.TikaCoreProperties; +import org.apache.tika.parser.RecursiveParserWrapper; +import org.apache.tika.utils.ExceptionUtils; + +public abstract class AbstractProfiler extends FileResourceConsumer { + + private static final String[] EXTRACT_EXTENSIONS = { + ".json", + ".txt", + "" + }; + + private static final String[] COMPRESSION_EXTENSIONS = { + "", + ".bz2", + ".gzip", + ".zip", + }; + static final long NON_EXISTENT_FILE_LENGTH = -1l; + + public static TableInfo REF_EXTRACT_ERROR_TYPES = new TableInfo("ref_extract_error_types", + new ColInfo(Cols.EXTRACT_ERROR_TYPE_ID, Types.INTEGER), + new ColInfo(Cols.EXTRACT_ERROR_DESCRIPTION, Types.VARCHAR, 128) + ); + + + public static TableInfo REF_PARSE_ERROR_TYPES = new TableInfo("ref_parse_error_types", + new ColInfo(Cols.PARSE_ERROR_TYPE_ID, Types.INTEGER), + new ColInfo(Cols.PARSE_ERROR_DESCRIPTION, Types.VARCHAR, 128) + ); + + public static TableInfo REF_PARSE_EXCEPTION_TYPES = new TableInfo("ref_parse_exception_types", + new ColInfo(Cols.PARSE_EXCEPTION_TYPE_ID, Types.INTEGER), + new ColInfo(Cols.PARSE_EXCEPTION_DESCRIPTION, Types.VARCHAR, 128) + ); + + public static final String TRUE = Boolean.toString(true); + public static final String FALSE = Boolean.toString(false); + + + protected static final AtomicInteger CONTAINER_ID = new AtomicInteger(); + protected static final AtomicInteger ID = new AtomicInteger(); + + + private final static String UNKNOWN_EXTENSION = "unk"; + //make this configurable + private final static String DIGEST_KEY = "X-TIKA:digest:MD5"; + + private static CommonTokenCountManager commonTokenCountManager; + private String lastExtractExtension = null; + + final AnalyzerManager analyzerManager; + final TokenCounter tokenCounter; + + public enum EXTRACT_ERROR_TYPE { + //what do you see when you look at the extract file + NO_EXTRACT_FILE, + ZERO_BYTE_EXTRACT_FILE, + EXTRACT_PARSE_EXCEPTION + } + + public enum EXCEPTION_TYPE { + 
RUNTIME, + ENCRYPTION, + ACCESS_PERMISSION, + UNSUPPORTED_VERSION, + } + + public enum PARSE_ERROR_TYPE { + //what was gathered from the log file during the batch run + OOM, + TIMEOUT + } + + public static TableInfo MIME_TABLE = new TableInfo("mimes", + new ColInfo(Cols.MIME_TYPE_ID, Types.INTEGER, "PRIMARY KEY"), + new ColInfo(Cols.MIME_STRING, Types.VARCHAR, 256), + new ColInfo(Cols.FILE_EXTENSION, Types.VARCHAR, 12) + ); + + private static Pattern FILE_NAME_CLEANER = Pattern.compile("\\.(json|txt)(\\.(bz2|gz|zip))?$"); + + + final static int FILE_PATH_MAX_LEN = 512;//max len for varchar for file_path + final static int MAX_STRING_LENGTH = 1000000; + final static int MAX_LEN_FOR_LANG_ID = 20000; + + //these remove runtime info from the stacktraces so + //that actual causes can be counted. + private final static Pattern CAUSED_BY_SNIPPER = + Pattern.compile("(Caused by: [^:]+):[^\\r\\n]+"); + + private final static Pattern ACCESS_PERMISSION_EXCEPTION = + Pattern.compile("org\\.apache\\.tika\\.exception\\.AccessPermissionException"); + private final static Pattern ENCRYPTION_EXCEPTION = + Pattern.compile("org\\.apache\\.tika.exception\\.EncryptedDocumentException"); + + private TikaConfig config = TikaConfig.getDefaultConfig();//TODO: allow configuration + final LanguageIDWrapper langIder; + protected IDBWriter writer; + + public static void loadCommonTokens(Path p) throws IOException { + commonTokenCountManager = new CommonTokenCountManager(p); + } + + public AbstractProfiler(ArrayBlockingQueue<FileResource> fileQueue, + IDBWriter writer) { + super(fileQueue); + this.writer = writer; + langIder = new LanguageIDWrapper(); + try { + analyzerManager = AnalyzerManager.newInstance(); + tokenCounter = new TokenCounter(analyzerManager.getGeneralAnalyzer(), + analyzerManager.getAlphaIdeoAnalyzer()); + } catch (IOException e) { + throw new RuntimeException(e); + } + } + + protected void writeError(TableInfo extractErrorTable, String containerId, + String filePath, Path 
extractsA) throws IOException { + Map<Cols, String> data = new HashMap<>(); + data.put(Cols.CONTAINER_ID, containerId); + data.put(Cols.FILE_PATH, filePath); + int errorCode = -1; + long len = -1; + if (extractsA != null) { + try { + len = Files.size(extractsA); + } catch (IOException e) { + //swallow + } + } + if (extractsA == null) { + errorCode = EXTRACT_ERROR_TYPE.NO_EXTRACT_FILE.ordinal(); + } else if (len == 0) { + errorCode = EXTRACT_ERROR_TYPE.ZERO_BYTE_EXTRACT_FILE.ordinal(); + } else { + errorCode = EXTRACT_ERROR_TYPE.EXTRACT_PARSE_EXCEPTION.ordinal(); + } + data.put(Cols.EXTRACT_ERROR_TYPE_ID, Integer.toString(errorCode)); + writer.writeRow(extractErrorTable, data); + + } + + protected void writeProfileData(EvalFilePaths fps, int i, Metadata m, + String fileId, String containerId, + List<Integer> numAttachments, TableInfo profileTable) { + + Map<Cols, String> data = new HashMap<>(); + data.put(Cols.ID, fileId); + data.put(Cols.CONTAINER_ID, containerId); + data.put(Cols.MD5, m.get(DIGEST_KEY)); + + if ( i < numAttachments.size()) { + data.put(Cols.NUM_ATTACHMENTS, Integer.toString(numAttachments.get(i))); + } + data.put(Cols.ELAPSED_TIME_MILLIS, getTime(m)); + data.put(Cols.NUM_METADATA_VALUES, + Integer.toString(countMetadataValues(m))); + + Integer nPages = m.getInt(PagedText.N_PAGES); + if (nPages != null) { + data.put(Cols.NUM_PAGES, Integer.toString(nPages)); + } + + //if the outer wrapper document + if (i == 0) { + + data.put(Cols.IS_EMBEDDED, FALSE); + data.put(Cols.FILE_NAME, fps.getRelativeSourceFilePath().getFileName().toString()); + } else { + data.put(Cols.IS_EMBEDDED, TRUE); + data.put(Cols.FILE_NAME, FilenameUtils.getName(m.get(RecursiveParserWrapper.EMBEDDED_RESOURCE_PATH))); + } + data.put(Cols.FILE_EXTENSION, + FilenameUtils.getExtension(fps.getRelativeSourceFilePath().getFileName().toString())); + long srcFileLen = getSourceFileLength(m); + if (srcFileLen > NON_EXISTENT_FILE_LENGTH) { + data.put(Cols.LENGTH, Long.toString(srcFileLen)); 
+ } else { + data.put(Cols.LENGTH, ""); + } + int numMetadataValues = countMetadataValues(m); + data.put(Cols.NUM_METADATA_VALUES, + Integer.toString(numMetadataValues)); + + data.put(Cols.ELAPSED_TIME_MILLIS, + getTime(m)); + + String content = getContent(m, MAX_STRING_LENGTH); + if (content == null || content.trim().length() == 0) { + data.put(Cols.HAS_CONTENT, FALSE); + } else { + data.put(Cols.HAS_CONTENT, TRUE); + } + getFileTypes(m, data); + try { + writer.writeRow(profileTable, data); + } catch (IOException e) { + throw new RuntimeException(e); + } + } + + protected void writeExceptionData(String fileId, Metadata m, TableInfo exceptionTable) { + Map<Cols, String> data = new HashMap<>(); + getExceptionStrings(m, data); + if (data.keySet().size() > 0) { + try { + data.put(Cols.ID, fileId); + writer.writeRow(exceptionTable, data); + } catch (IOException e) { + throw new RuntimeException(e); + } + } + } + + /** + * Checks to see if metadata is null or content is empty (null or only whitespace). + * If any of these, then this does no processing, and the fileId is not + * entered into the content table. + * + * @param fileId + * @param m + * @param fieldName + * @param contentsTable + */ + protected void writeContentData(String fileId, Metadata m, + String fieldName, TableInfo contentsTable) throws IOException { + if (m == null) { + return; + } + + String content = getContent(m, MAX_STRING_LENGTH); + if (content == null || content.trim().length() == 0) { + return; + } + tokenCounter.clear(fieldName); + tokenCounter.add(fieldName, content); + + Map<Cols, String> data = new HashMap<>(); + data.put(Cols.ID, fileId); + data.put(Cols.CONTENT_LENGTH, Integer.toString(content.length())); + langid(m, data); + String langid = data.get(Cols.LANG_ID_1); + langid = (langid == null) ? 
"" : langid; + + writeTokenCounts(data, fieldName, tokenCounter); + CommonTokenResult commonTokenResult = null; + try { + commonTokenResult = commonTokenCountManager.countTokenOverlaps(langid, + tokenCounter.getAlphaTokens(fieldName)); + } catch (IOException e) { + logger.error(e.getMessage(), e); + } + data.put(Cols.COMMON_TOKENS_LANG, commonTokenResult.getLangCode()); + data.put(Cols.NUM_COMMON_TOKENS, Integer.toString(commonTokenResult.getTokens())); + TokenStatistics tokenStatistics = tokenCounter.getTokenStatistics(fieldName); + TokenStatistics alphaTokenStatistics = tokenCounter.getAlphaTokenStatistics(fieldName); + data.put(Cols.NUM_UNIQUE_TOKENS, + Integer.toString(tokenStatistics.getTotalUniqueTokens())); + data.put(Cols.NUM_TOKENS, + Integer.toString(tokenStatistics.getTotalTokens())); + data.put(Cols.NUM_ALPHABETIC_TOKENS, + Integer.toString(alphaTokenStatistics.getTotalTokens())); + + data.put(Cols.TOKEN_ENTROPY_RATE, + Double.toString(tokenStatistics.getEntropy())); + SummaryStatistics summStats = tokenStatistics.getSummaryStatistics(); + data.put(Cols.TOKEN_LENGTH_SUM, + Integer.toString((int) summStats.getSum())); + + data.put(Cols.TOKEN_LENGTH_MEAN, + Double.toString(summStats.getMean())); + + data.put(Cols.TOKEN_LENGTH_STD_DEV, + Double.toString(summStats.getStandardDeviation())); + unicodeBlocks(m, data); + try { + writer.writeRow(contentsTable, data); + } catch (IOException e) { + throw new RuntimeException(e); + } + } + + String getTime(Metadata m) { + String elapsed = "-1"; + + String v = m.get(RecursiveParserWrapper.PARSE_TIME_MILLIS); + if (v != null) { + return v; + } + return elapsed; + } + + int countMetadataValues(Metadata m) { + if (m == null) { + return 0; + } + int i = 0; + for (String n : m.names()) { + i += m.getValues(n).length; + } + return i; + } + + void getExceptionStrings(Metadata metadata, Map<Cols, String> data) { + + String fullTrace = metadata.get(TikaCoreProperties.TIKA_META_EXCEPTION_PREFIX + "runtime"); + + if (fullTrace 
== null) { + fullTrace = metadata.get(RecursiveParserWrapper.EMBEDDED_EXCEPTION); + } + + if (fullTrace != null) { + //check for "expected" exceptions...exceptions + //that can't be fixed. + //Do not store trace for "expected" exceptions + + Matcher matcher = ACCESS_PERMISSION_EXCEPTION.matcher(fullTrace); + if (matcher.find()) { + data.put(Cols.PARSE_EXCEPTION_TYPE_ID, + Integer.toString(EXCEPTION_TYPE.ACCESS_PERMISSION.ordinal())); + return; + } + matcher = ENCRYPTION_EXCEPTION.matcher(fullTrace); + if (matcher.find()) { + data.put(Cols.PARSE_EXCEPTION_TYPE_ID, + Integer.toString(EXCEPTION_TYPE.ENCRYPTION.ordinal())); + return; + } + + data.put(Cols.PARSE_EXCEPTION_TYPE_ID, + Integer.toString(EXCEPTION_TYPE.RUNTIME.ordinal())); + + data.put(Cols.ORIG_STACK_TRACE, fullTrace); + //TikaExceptions can have object ids, as in the "@2b1ea6ee" in: + //org.apache.tika.exception.TikaException: TIKA-198: Illegal + //IOException from org.apache.tika.parser.microsoft.OfficeParser@2b1ea6ee + //For reporting purposes, let's snip off the object id so that we can more + //easily count exceptions. 
+ String sortTrace = ExceptionUtils.trimMessage(fullTrace); + + matcher = CAUSED_BY_SNIPPER.matcher(sortTrace); + sortTrace = matcher.replaceAll("$1"); + sortTrace = sortTrace.replaceAll("org.apache.tika.", "o.a.t."); + data.put(Cols.SORT_STACK_TRACE, sortTrace); + } + } + + protected static String getContent(Metadata metadata, int maxLength) { + if (metadata == null) { + return ""; + } + String c = metadata.get(RecursiveParserWrapper.TIKA_CONTENT); + if (c == null) { + return ""; + } + if (c.length() > maxLength) { + c = c.substring(0, maxLength); + } + return c; + } + + void unicodeBlocks(Metadata metadata, Map<Cols, String> data) { + String content = getContent(metadata, MAX_LEN_FOR_LANG_ID); + if (content.length() < 200) { + return; + } + String s = content; + if (content.length() > MAX_LEN_FOR_LANG_ID) { + s = content.substring(0, MAX_LEN_FOR_LANG_ID); + } + Map<String, Integer> m = new HashMap<>(); + Reader r = new StringReader(s); + try { + int c = r.read(); + while (c != -1) { + Character.UnicodeBlock block = Character.UnicodeBlock.of(c); + String blockString = (block == null) ? 
"NULL" : block.toString(); + Integer i = m.get(blockString); + if (i == null) { + i = 0; + } + i++; + if (block == null) { + blockString = "NULL"; + } + m.put(blockString, i); + c = r.read(); + } + } catch (IOException e) { + e.printStackTrace(); + //swallow + } + + List<Pair<String, Integer>> pairs = new ArrayList<>(); + for (Map.Entry<String, Integer> e : m.entrySet()) { + pairs.add(Pair.of(e.getKey(), e.getValue())); + } + Collections.sort(pairs, new Comparator<Pair<String, Integer>>() { + @Override + public int compare(Pair<String, Integer> o1, Pair<String, Integer> o2) { + return o2.getValue().compareTo(o1.getValue()); + } + }); + StringBuilder sb = new StringBuilder(); + + for (int i = 0; i < 20 && i < pairs.size(); i++) { + if (i > 0) { + sb.append(" | "); + } + sb.append(pairs.get(i).getKey()+": "+pairs.get(i).getValue()); + } + data.put(Cols.UNICODE_CHAR_BLOCKS, sb.toString()); + } + + void langid(Metadata metadata, Map<Cols, String> data) { + String content = getContent(metadata, MAX_LEN_FOR_LANG_ID); + if (content.length() < 50) { + return; + } + String s = content; + if (content.length() > MAX_LEN_FOR_LANG_ID) { + s = content.substring(0, MAX_LEN_FOR_LANG_ID); + } + List<DetectedLanguage> probabilities = langIder.getProbabilities(s); + if (probabilities.size() > 0) { + data.put(Cols.LANG_ID_1, probabilities.get(0).getLocale().getLanguage()); + data.put(Cols.LANG_ID_PROB_1, + Double.toString(probabilities.get(0).getProbability())); + } + if (probabilities.size() > 1) { + data.put(Cols.LANG_ID_2, probabilities.get(1).getLocale().getLanguage()); + data.put(Cols.LANG_ID_PROB_2, + Double.toString(probabilities.get(1).getProbability())); + } + + } + + void getFileTypes(Metadata metadata, Map<Cols, String> output) { + if (metadata == null) { + return; + } + String type = metadata.get(Metadata.CONTENT_TYPE); + if (type == null) { + return; + } + int mimeId = writer.getMimeId(type); + output.put(Cols.MIME_TYPE_ID, Integer.toString(mimeId)); + } + + void 
writeTokenCounts(Map<Cols, String> data, String field, + TokenCounter tokenCounter) { + + + int stops = 0; + int i = 0; + StringBuilder sb = new StringBuilder(); + TokenStatistics tokenStatistics = tokenCounter.getTokenStatistics(field); + for (TokenIntPair t : tokenStatistics.getTopN()) { + if (i++ > 0) { + sb.append(" | "); + } + sb.append(t.getToken() + ": " + t.getValue()); + } + + data.put(Cols.TOP_N_TOKENS, sb.toString()); + } + + + public void closeWriter() throws IOException { + writer.close(); + } + + + /** + * + * @param metadata + * @param extractDir + * @return evalfilepaths for files if crawling an extract directory + */ + protected EvalFilePaths getPathsFromExtractCrawl(Metadata metadata, + Path extractDir) { + String relExtractFilePath = metadata.get(FSProperties.FS_REL_PATH); + Matcher m = FILE_NAME_CLEANER.matcher(relExtractFilePath); + Path relativeSourceFilePath = Paths.get(m.replaceAll("")); + //just try slapping the relextractfilepath on the extractdir + Path extractFile = extractDir.resolve(relExtractFilePath); + if (! Files.isRegularFile(extractFile)) { + //if that doesn't work, try to find the right extract file. 
+ //This is necessary if crawling extractsA and trying to find a file in + //extractsB that is not in the same format: json vs txt or compressed + extractFile = findFile(extractDir, relativeSourceFilePath); + } + return new EvalFilePaths(relativeSourceFilePath, extractFile); + } + //call this if the crawler is crawling through the src directory + protected EvalFilePaths getPathsFromSrcCrawl(Metadata metadata, Path srcDir, + Path extractDir) { + Path relativeSourceFilePath = Paths.get(metadata.get(FSProperties.FS_REL_PATH)); + Path extractFile = findFile(extractDir, relativeSourceFilePath); + Path inputFile = srcDir.resolve(relativeSourceFilePath); + long srcLen = -1l; + //try to get the length of the source file in case there was an error + //in both extracts + try { + srcLen = Files.size(inputFile); + } catch (IOException e) { + logger.warn("Couldn't get length for: "+inputFile.toAbsolutePath()); + } + return new EvalFilePaths(relativeSourceFilePath, extractFile, srcLen); + } + + /** + * + * @param extractRootDir + * @param relativeSourceFilePath + * @return extractFile or null if couldn't find one. 
+ */ + private Path findFile(Path extractRootDir, Path relativeSourceFilePath) { + String relSrcFilePathString = relativeSourceFilePath.toString(); + if (lastExtractExtension != null) { + Path candidate = extractRootDir.resolve(relSrcFilePathString+lastExtractExtension); + if (Files.isRegularFile(candidate)) { + return candidate; + } + } + for (String ext : EXTRACT_EXTENSIONS) { + for (String compress : COMPRESSION_EXTENSIONS) { + Path candidate = extractRootDir.resolve(relSrcFilePathString+ext+compress); + if (Files.isRegularFile(candidate)) { + lastExtractExtension = ext+compress; + return candidate; + } + } + } + return null; + } + + protected long getSourceFileLength(EvalFilePaths fps, List<Metadata> metadataList) { + if (fps.getSourceFileLength() > NON_EXISTENT_FILE_LENGTH) { + return fps.getSourceFileLength(); + } + return getSourceFileLength(metadataList); + } + + long getSourceFileLength(List<Metadata> metadataList) { + if (metadataList == null || metadataList.size() < 1) { + return NON_EXISTENT_FILE_LENGTH; + } + return getSourceFileLength(metadataList.get(0)); + } + + long getSourceFileLength(Metadata m) { + String lenString = m.get(Metadata.CONTENT_LENGTH); + if (lenString == null) { + return NON_EXISTENT_FILE_LENGTH; + } + try { + return Long.parseLong(lenString); + } catch (NumberFormatException e) { + //swallow + } + return NON_EXISTENT_FILE_LENGTH; + } + + protected long getFileLength(Path p) { + if (p != null && Files.isRegularFile(p)) { + try { + return Files.size(p); + } catch (IOException e) { + //swallow + } + } + return NON_EXISTENT_FILE_LENGTH; + } + + /** + * + * @param list + * @return empty list if input list is empty or null + */ + static List<Integer> countAttachments(List<Metadata> list) { + List<Integer> ret = new ArrayList<>(); + if (list == null || list.size() == 0) { + return ret; + } + //container document attachment count = list.size()-1 + ret.add(list.size()-1); + + Map<String, Integer> counts = new HashMap<>(); + for (int i = 1; 
i < list.size(); i++) { + String path = list.get(i).get(RecursiveParserWrapper.EMBEDDED_RESOURCE_PATH); + if (path == null) { + //shouldn't ever happen + continue; + } + String[] parts = path.split("/"); + StringBuilder parent = new StringBuilder(); + for (int end = 1; end < parts.length-1; end++) { + parent.setLength(0); + join("/", parent, parts, 1, end); + String parentPath = parent.toString(); + Integer count = counts.get(parentPath); + if (count == null) { + count = 1; + } else { + count++; + } + counts.put(parentPath, count); + } + } + + for (int i = 1; i < list.size(); i++) { + Integer count = counts.get(list.get(i).get(RecursiveParserWrapper.EMBEDDED_RESOURCE_PATH)); + if (count == null) { + count = 0; + } + ret.add(i, count); + } + return ret; + + + } + + private static void join(String delimiter, StringBuilder sb, String[] parts, int start, int end) { + for (int i = start; i <= end; i++) { + sb.append(delimiter); + sb.append(parts[i]); + } + } +} + http://git-wip-us.apache.org/repos/asf/tika/blob/5e49c330/tika-eval/src/main/java/org/apache/tika/eval/EvalFilePaths.java ---------------------------------------------------------------------- diff --git a/tika-eval/src/main/java/org/apache/tika/eval/EvalFilePaths.java b/tika-eval/src/main/java/org/apache/tika/eval/EvalFilePaths.java new file mode 100644 index 0000000..1a3d29c --- /dev/null +++ b/tika-eval/src/main/java/org/apache/tika/eval/EvalFilePaths.java @@ -0,0 +1,108 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. 
You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.tika.eval; + +import static org.apache.tika.eval.AbstractProfiler.NON_EXISTENT_FILE_LENGTH; + +import java.io.IOException; +import java.nio.file.Files; +import java.nio.file.Path; + +/** + * Simple struct to keep track of relative path of source file ( + * original binary file, e.g. /subdir/document1.doc) + * and the extract file (e.g. /subdir/document1.doc.json). + */ +class EvalFilePaths { + + private final Path relativeSourceFilePath; + private final Path extractFile; + + private long sourceFileLength = NON_EXISTENT_FILE_LENGTH; + private long extractFileLength = NON_EXISTENT_FILE_LENGTH; + + + public EvalFilePaths(Path relativeSourceFilePath, Path extractFile, long srcFileLen) { + this(relativeSourceFilePath, extractFile); + this.sourceFileLength = srcFileLen; + } + + public EvalFilePaths(Path relativeSourceFilePath, Path extractFile) { + if (extractFile != null && Files.isRegularFile(extractFile)) { + try { + extractFileLength = Files.size(extractFile); + } catch (IOException e) { + //swallow ? + } + } + this.relativeSourceFilePath = relativeSourceFilePath; + this.extractFile = extractFile; + } + + public Path getRelativeSourceFilePath() { + return relativeSourceFilePath; + } + + //this path may or may not exist and it could be null! + public Path getExtractFile() { + return extractFile; + } + + //if it doesn't exist, it'll be -1l. 
+ public long getSourceFileLength() { + return sourceFileLength; + } + + public long getExtractFileLength() { + return extractFileLength; + } + + @Override + public boolean equals(Object o) { + if (this == o) return true; + if (o == null || getClass() != o.getClass()) return false; + + EvalFilePaths that = (EvalFilePaths) o; + + if (sourceFileLength != that.sourceFileLength) return false; + if (extractFileLength != that.extractFileLength) return false; + if (relativeSourceFilePath != null ? !relativeSourceFilePath.equals(that.relativeSourceFilePath) : that.relativeSourceFilePath != null) + return false; + return extractFile != null ? extractFile.equals(that.extractFile) : that.extractFile == null; + + } + + @Override + public int hashCode() { + int result = relativeSourceFilePath != null ? relativeSourceFilePath.hashCode() : 0; + result = 31 * result + (extractFile != null ? extractFile.hashCode() : 0); + result = 31 * result + (int) (sourceFileLength ^ (sourceFileLength >>> 32)); + result = 31 * result + (int) (extractFileLength ^ (extractFileLength >>> 32)); + return result; + } + + @Override + public String toString() { + return "EvalFilePaths{" + + "relativeSourceFilePath=" + relativeSourceFilePath + + ", extractFile=" + extractFile + + ", sourceFileLength=" + sourceFileLength + + ", extractFileLength=" + extractFileLength + + '}'; + } +} http://git-wip-us.apache.org/repos/asf/tika/blob/5e49c330/tika-eval/src/main/java/org/apache/tika/eval/ExtractComparer.java ---------------------------------------------------------------------- diff --git a/tika-eval/src/main/java/org/apache/tika/eval/ExtractComparer.java b/tika-eval/src/main/java/org/apache/tika/eval/ExtractComparer.java new file mode 100644 index 0000000..8b3d266 --- /dev/null +++ b/tika-eval/src/main/java/org/apache/tika/eval/ExtractComparer.java @@ -0,0 +1,455 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. 
See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.apache.tika.eval; + +import java.io.IOException; +import java.nio.file.Path; +import java.sql.Types; +import java.util.HashMap; +import java.util.HashSet; +import java.util.List; +import java.util.Map; +import java.util.Set; +import java.util.concurrent.ArrayBlockingQueue; + +import org.apache.commons.cli.HelpFormatter; +import org.apache.commons.cli.Option; +import org.apache.commons.cli.Options; +import org.apache.commons.io.FilenameUtils; +import org.apache.tika.batch.FileResource; +import org.apache.tika.batch.fs.FSProperties; +import org.apache.tika.config.TikaConfig; +import org.apache.tika.eval.db.ColInfo; +import org.apache.tika.eval.db.Cols; +import org.apache.tika.eval.db.TableInfo; +import org.apache.tika.eval.io.ExtractReader; +import org.apache.tika.eval.io.IDBWriter; +import org.apache.tika.eval.tokens.ContrastStatistics; +import org.apache.tika.eval.tokens.TokenContraster; +import org.apache.tika.eval.tokens.TokenIntPair; +import org.apache.tika.metadata.Metadata; +import org.apache.tika.parser.RecursiveParserWrapper; + +public class ExtractComparer extends AbstractProfiler { + + static Options OPTIONS; + static { + Option extractsA = new Option("extractsA", true, "directory for extractsA files"); + extractsA.setRequired(true); + + Option extractsB = 
new Option("extractsB", true, "directory for extractsB files"); + extractsB.setRequired(true); + + Option db = new Option("db", true, "db file to which to write results"); + db.setRequired(true); + + Option inputDir = new Option("inputDir", true, + "optional: directory of original binary input files if it exists " + + "or can be the same as -extractsA or -extractsB. If not specified, -inputDir=-extractsA"); + inputDir.setRequired(true); + + + OPTIONS = new Options() + .addOption(extractsA) + .addOption(extractsB) + .addOption(db) + .addOption(inputDir) + .addOption("bc", "optional: tika-batch config file") + .addOption("numConsumers", true, "optional: number of consumer threads") + .addOption(new Option("alterExtract", true, + "for json-formatted extract files, " + + "process full metadata list ('as_is'=default), " + + "take just the first/container document ('first_only'), " + + "concatenate all content into the first metadata item ('concatenate_content')") + ); + } + + public static void USAGE() { + HelpFormatter helpFormatter = new HelpFormatter(); + helpFormatter.printHelp( + 80, + "java -jar tika-eval-x.y.jar Compare -extractsA extractsA -extractsB extractsB -db mydb", + "Tool: Compare", + ExtractComparer.OPTIONS, + "Note: for h2 db, do not include the .mv.db at the end of the db name."); + } + + private final static String FIELD_A = "fa"; + private final static String FIELD_B = "fb"; + + public static TableInfo REF_PAIR_NAMES = new TableInfo("pair_names", + new ColInfo(Cols.DIR_NAME_A, Types.VARCHAR, 128), + new ColInfo(Cols.DIR_NAME_B, Types.VARCHAR, 128) + ); + + public static TableInfo COMPARISON_CONTAINERS = new TableInfo("containers", + new ColInfo(Cols.CONTAINER_ID, Types.INTEGER, "PRIMARY KEY"), + new ColInfo(Cols.FILE_PATH, Types.VARCHAR, FILE_PATH_MAX_LEN), + new ColInfo(Cols.FILE_EXTENSION, Types.VARCHAR, 12), + new ColInfo(Cols.LENGTH, Types.BIGINT), + new ColInfo(Cols.EXTRACT_FILE_LENGTH_A, Types.BIGINT), + new ColInfo(Cols.EXTRACT_FILE_LENGTH_B, 
Types.BIGINT) + ); + + public static TableInfo CONTENT_COMPARISONS = new TableInfo("content_comparisons", + new ColInfo(Cols.ID, Types.INTEGER, "PRIMARY KEY"), + new ColInfo(Cols.TOP_10_UNIQUE_TOKEN_DIFFS_A, Types.VARCHAR, 1024), + new ColInfo(Cols.TOP_10_UNIQUE_TOKEN_DIFFS_B, Types.VARCHAR, 1024), + new ColInfo(Cols.TOP_10_MORE_IN_A, Types.VARCHAR, 1024), + new ColInfo(Cols.TOP_10_MORE_IN_B, Types.VARCHAR, 1024), + new ColInfo(Cols.DICE_COEFFICIENT, Types.FLOAT), + new ColInfo(Cols.OVERLAP, Types.FLOAT) + ); + + public static TableInfo PROFILES_A = new TableInfo( "profiles_a", + ExtractProfiler.PROFILE_TABLE.getColInfos()); + + public static TableInfo PROFILES_B = new TableInfo( "profiles_b", + ExtractProfiler.PROFILE_TABLE.getColInfos()); + + public static TableInfo EMBEDDED_FILE_PATH_TABLE_A = new TableInfo( "emb_path_a", + ExtractProfiler.EMBEDDED_FILE_PATH_TABLE.getColInfos()); + + public static TableInfo EMBEDDED_FILE_PATH_TABLE_B = new TableInfo( "emb_path_b", + ExtractProfiler.EMBEDDED_FILE_PATH_TABLE.getColInfos()); + + + public static TableInfo CONTENTS_TABLE_A = new TableInfo( "contents_a", + ExtractProfiler.CONTENTS_TABLE.getColInfos()); + + public static TableInfo CONTENTS_TABLE_B = new TableInfo( "contents_b", + ExtractProfiler.CONTENTS_TABLE.getColInfos()); + + public static TableInfo EXCEPTION_TABLE_A = new TableInfo ("exceptions_a", + ExtractProfiler.EXCEPTION_TABLE.getColInfos()); + + public static TableInfo EXCEPTION_TABLE_B = new TableInfo ("exceptions_b", + ExtractProfiler.EXCEPTION_TABLE.getColInfos()); + + public static TableInfo ERROR_TABLE_A = new TableInfo("extract_errors_a", + ExtractProfiler.ERROR_TABLE.getColInfos()); + public static TableInfo ERROR_TABLE_B = new TableInfo("extract_errors_b", + ExtractProfiler.ERROR_TABLE.getColInfos()); + + + //need to parameterize? 
+ private final TikaConfig config = TikaConfig.getDefaultConfig(); + + private final Path inputDir; + private final Path extractsA; + private final Path extractsB; + + private final long minJsonLength; + private final long maxJsonLength; + private final ExtractReader.ALTER_METADATA_LIST alterExtractList; + + private final TokenContraster tokenContraster = new TokenContraster(); + private final ExtractReader extractReader = new ExtractReader(); + + public ExtractComparer(ArrayBlockingQueue<FileResource> queue, + Path inputDir, Path extractsA, Path extractsB, + IDBWriter writer, long minJsonLength, + long maxJsonLength, ExtractReader.ALTER_METADATA_LIST alterExtractList) { + super(queue, writer); + this.minJsonLength = minJsonLength; + this.maxJsonLength = maxJsonLength; + this.inputDir = inputDir; + this.extractsA = extractsA; + this.extractsB = extractsB; + this.alterExtractList = alterExtractList; + } + + @Override + public boolean processFileResource(FileResource fileResource) { + Metadata metadata = fileResource.getMetadata(); + EvalFilePaths fpsA = null; + EvalFilePaths fpsB = null; + + if (inputDir != null && (inputDir.equals(extractsA) || + inputDir.equals(extractsB))) { + //crawling an extract dir + fpsA = getPathsFromExtractCrawl(metadata, extractsA); + fpsB = getPathsFromExtractCrawl(metadata, extractsB); + + } else { + fpsA = getPathsFromSrcCrawl(metadata, inputDir, extractsA); + fpsB = getPathsFromSrcCrawl(metadata, inputDir, extractsB); + } + + if (minJsonLength > -1) { + //if both files exist and are < minJsonLength, skip em + if (fpsA.getExtractFileLength() > NON_EXISTENT_FILE_LENGTH + && fpsA.getExtractFileLength() < minJsonLength + && fpsB.getExtractFileLength() > NON_EXISTENT_FILE_LENGTH + && fpsB.getExtractFileLength() < minJsonLength) { + return false; + } + } + if (maxJsonLength > -1) { + if ((fpsA.getExtractFileLength() > maxJsonLength) || + (fpsB.getExtractFileLength() > maxJsonLength)) { + return false; + } + } + + + try { + 
compareFiles(fpsA, fpsB);
+ } catch (Throwable e) {
+ //this should be cataclysmic...
+ //(no printStackTrace here: the cause is preserved in the rethrow)
+ throw new RuntimeException("Exception while working on: " +
+ metadata.get(FSProperties.FS_REL_PATH), e);
+ }
+ return true;
+ }
+
+ //protected for testing, should find better way so that this can be private!
+ protected void compareFiles(EvalFilePaths fpsA, EvalFilePaths fpsB) throws IOException {
+
+ List<Metadata> metadataListA =
+ extractReader.loadExtract(fpsA.getExtractFile(), alterExtractList);
+ List<Metadata> metadataListB =
+ extractReader.loadExtract(fpsB.getExtractFile(), alterExtractList);
+
+ //array indices for those metadata items handled in
+ //"that"
+ Set<Integer> handledB = new HashSet<>();
+ String containerID = Integer.toString(CONTAINER_ID.getAndIncrement());
+ //container table
+ Map<Cols, String> contData = new HashMap<>();
+ contData.put(Cols.CONTAINER_ID, containerID);
+ contData.put(Cols.FILE_PATH, fpsA.getRelativeSourceFilePath().toString());
+ long srcFileLength = getSourceFileLength(metadataListA, metadataListB);
+ contData.put(Cols.LENGTH,
+ srcFileLength > NON_EXISTENT_FILE_LENGTH ?
+ Long.toString(srcFileLength) : "");
+ contData.put(Cols.FILE_EXTENSION,
+ FilenameUtils.getExtension(fpsA.getRelativeSourceFilePath().getFileName().toString()));
+
+ long extractFileLengthA = getFileLength(fpsA.getExtractFile());
+ contData.put(Cols.EXTRACT_FILE_LENGTH_A, extractFileLengthA > NON_EXISTENT_FILE_LENGTH ?
+ Long.toString(extractFileLengthA) : "");
+
+ //FIX: was fpsA.getExtractFile() -- copy/paste bug that wrote extract A's
+ //length into the EXTRACT_FILE_LENGTH_B column
+ long extractFileLengthB = getFileLength(fpsB.getExtractFile());
+ contData.put(Cols.EXTRACT_FILE_LENGTH_B, extractFileLengthB > NON_EXISTENT_FILE_LENGTH ?
+ Long.toString(extractFileLengthB) : ""); + + writer.writeRow(COMPARISON_CONTAINERS, contData); + + if (metadataListA == null) { + writeError(ERROR_TABLE_A, containerID, fpsA.getRelativeSourceFilePath().toString(), + fpsA.getExtractFile()); + } + if (metadataListB == null) { + writeError(ERROR_TABLE_B, containerID, fpsB.getRelativeSourceFilePath().toString(), + fpsB.getExtractFile()); + } + + if (metadataListA == null && metadataListB == null) { + return; + } + List<Integer> numAttachmentsA = countAttachments(metadataListA); + List<Integer> numAttachmentsB = countAttachments(metadataListB); + + //now get that metadata + if (metadataListA != null) { + for (int i = 0; i < metadataListA.size(); i++) { + String fileId = Integer.toString(ID.getAndIncrement()); + Metadata metadataA = metadataListA.get(i); + Metadata metadataB = null; + //TODO: shouldn't be fileA!!!! + writeProfileData(fpsA, i, metadataA, fileId, containerID, numAttachmentsA, PROFILES_A); + writeExceptionData(fileId, metadataA, EXCEPTION_TABLE_A); + int matchIndex = getMatch(i, metadataListA, metadataListB); + + if (matchIndex > -1) { + metadataB = metadataListB.get(matchIndex); + handledB.add(matchIndex); + } + if (metadataB != null) { + writeProfileData(fpsB, i, metadataB, fileId, containerID, numAttachmentsB, PROFILES_B); + writeExceptionData(fileId, metadataB, EXCEPTION_TABLE_B); + } + writeEmbeddedFilePathData(i, fileId, metadataA, metadataB); + //prep the token counting + tokenCounter.clear(FIELD_A); + tokenCounter.clear(FIELD_B); + //write content + try { + writeContentData(fileId, metadataA, FIELD_A, CONTENTS_TABLE_A); + writeContentData(fileId, metadataB, FIELD_B, CONTENTS_TABLE_B); + } catch (IOException e) { + throw new RuntimeException(e); + } + + //now run comparisons + if (tokenCounter.getTokenStatistics(FIELD_A).getTotalTokens() > 0 + && tokenCounter.getTokenStatistics(FIELD_B).getTotalTokens() > 0) { + Map<Cols, String> data = new HashMap<>(); + data.put(Cols.ID, fileId); + + 
ContrastStatistics contrastStatistics = + tokenContraster.calculateContrastStatistics( + tokenCounter.getTokens(FIELD_A), + tokenCounter.getTokenStatistics(FIELD_A), + tokenCounter.getTokens(FIELD_B), + tokenCounter.getTokenStatistics(FIELD_B)); + + writeContrasts(data, contrastStatistics); + writer.writeRow(CONTENT_COMPARISONS, data); + } + } + } + //now try to get any Metadata objects in "that" + //that haven't yet been handled. + if (metadataListB != null) { + for (int i = 0; i < metadataListB.size(); i++) { + if (handledB.contains(i)) { + continue; + } + Metadata metadataB = metadataListB.get(i); + String fileId = Integer.toString(ID.getAndIncrement()); + writeProfileData(fpsB, i, metadataB, fileId, containerID, numAttachmentsB, PROFILES_B); + writeEmbeddedFilePathData(i, fileId, null, metadataB); + writeExceptionData(fileId, metadataB, EXCEPTION_TABLE_B); + + //prep the token counting + tokenCounter.clear(FIELD_B); + //write content + try { + writeContentData(fileId, metadataB, FIELD_B, CONTENTS_TABLE_B); + } catch (IOException e) { + throw new RuntimeException(e); + } + } + } + } + + private void writeEmbeddedFilePathData(int i, String fileId, Metadata mA, Metadata mB) { + //container file, don't write anything + if (i == 0) { + return; + } + String pathA = null; + String pathB = null; + if (mA != null) { + pathA = mA.get(RecursiveParserWrapper.EMBEDDED_RESOURCE_PATH); + } + if (mB != null) { + pathB = mB.get(RecursiveParserWrapper.EMBEDDED_RESOURCE_PATH); + } + if (pathA != null) { + Map<Cols, String> d = new HashMap<>(); + d.put(Cols.ID, fileId); + d.put(Cols.EMBEDDED_FILE_PATH, pathA); + try { + writer.writeRow(EMBEDDED_FILE_PATH_TABLE_A, d); + } catch (IOException e) { + throw new RuntimeException(e); + } + } + if (pathB != null && + (pathA == null || ! 
pathA.equals(pathB))) {
+ Map<Cols, String> d = new HashMap<>();
+ d.put(Cols.ID, fileId);
+ d.put(Cols.EMBEDDED_FILE_PATH, pathB);
+ try {
+ writer.writeRow(EMBEDDED_FILE_PATH_TABLE_B, d);
+ } catch (IOException e) {
+ throw new RuntimeException(e);
+ }
+ }
+ }
+
+ //prefer the length recoverable from extracts A; fall back to B
+ private long getSourceFileLength(List<Metadata> metadataListA, List<Metadata> metadataListB) {
+ long len = getSourceFileLength(metadataListA);
+ if (len > NON_EXISTENT_FILE_LENGTH) {
+ return len;
+ }
+ return getSourceFileLength(metadataListB);
+ }
+
+
+ /**
+ * Try to find the matching metadata based on the RecursiveParserWrapper.EMBEDDED_RESOURCE_PATH
+ * If you can't find it, return -1;
+ *
+ * @param i index for match in metadataListA
+ * @param metadataListA
+ * @param metadataListB
+ * @return
+ */
+ private int getMatch(int i,
+ List<Metadata> metadataListA,
+ List<Metadata> metadataListB) {
+ //TODO: could make this more robust
+ if (metadataListB == null || metadataListB.size() == 0) {
+ return -1;
+ }
+ if (i == 0) {
+ return 0;
+ }
+ if (metadataListA.size() == metadataListB.size()) {
+ //assume no rearrangments if lists are the same size
+ return i;
+ }
+
+ Metadata thisMetadata = metadataListA.get(i);
+ String embeddedPath = thisMetadata.get(RecursiveParserWrapper.EMBEDDED_RESOURCE_PATH);
+ if (embeddedPath == null) {
+ return -1;
+ }
+
+ //linear scan of B for the same embedded resource path
+ //(the dead empty "if (i < metadataListB.size()) { }" statement was removed)
+ for (int j = 0; j < metadataListB.size(); j++) {
+ String thatEmbeddedPath = metadataListB.get(j).get(
+ RecursiveParserWrapper.EMBEDDED_RESOURCE_PATH);
+ if (embeddedPath.equals(thatEmbeddedPath)) {
+ return j;
+ }
+ }
+ return -1;
+ }
+
+
+
+
+ private void writeContrasts(Map<Cols, String> data, ContrastStatistics contrastStatistics) {
+ writeContrastString(data, Cols.TOP_10_MORE_IN_A, contrastStatistics.getTopNMoreA());
+ writeContrastString(data, Cols.TOP_10_MORE_IN_B, contrastStatistics.getTopNMoreB());
+ writeContrastString(data, Cols.TOP_10_UNIQUE_TOKEN_DIFFS_A, contrastStatistics.getTopNUniqueA());
writeContrastString(data, Cols.TOP_10_UNIQUE_TOKEN_DIFFS_B, contrastStatistics.getTopNUniqueB()); + data.put(Cols.OVERLAP, Double.toString(contrastStatistics.getOverlap())); + data.put(Cols.DICE_COEFFICIENT, Double.toString(contrastStatistics.getDiceCoefficient())); + + } + + private void writeContrastString(Map<Cols, String> data, Cols col, TokenIntPair[] tokenIntPairs) { + + int i = 0; + StringBuilder sb = new StringBuilder(); + for (TokenIntPair p : tokenIntPairs) { + if (i++ > 0) { + sb.append(" | "); + } + sb.append(p.getToken()).append(": ").append(p.getValue()); + } + data.put(col, sb.toString()); + } +} \ No newline at end of file http://git-wip-us.apache.org/repos/asf/tika/blob/5e49c330/tika-eval/src/main/java/org/apache/tika/eval/ExtractProfiler.java ---------------------------------------------------------------------- diff --git a/tika-eval/src/main/java/org/apache/tika/eval/ExtractProfiler.java b/tika-eval/src/main/java/org/apache/tika/eval/ExtractProfiler.java new file mode 100644 index 0000000..6840926 --- /dev/null +++ b/tika-eval/src/main/java/org/apache/tika/eval/ExtractProfiler.java @@ -0,0 +1,238 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ +package org.apache.tika.eval; + + +import java.io.IOException; +import java.nio.file.Path; +import java.sql.Types; +import java.util.HashMap; +import java.util.List; +import java.util.Map; +import java.util.concurrent.ArrayBlockingQueue; + +import org.apache.commons.cli.HelpFormatter; +import org.apache.commons.cli.Option; +import org.apache.commons.cli.Options; +import org.apache.tika.batch.FileResource; +import org.apache.tika.eval.db.ColInfo; +import org.apache.tika.eval.db.Cols; +import org.apache.tika.eval.db.TableInfo; +import org.apache.tika.eval.io.ExtractReader; +import org.apache.tika.eval.io.IDBWriter; +import org.apache.tika.metadata.Metadata; +import org.apache.tika.parser.RecursiveParserWrapper; + +public class ExtractProfiler extends AbstractProfiler { + + static Options OPTIONS; + static { + Option db = new Option("db", true, "db file to which to write results"); + db.setRequired(true); + + //By the time this commandline is parsed, there should be both an extractDir and an inputDir + Option extractDir = new Option("extractDir", true, "directory for extract files"); + extractDir.setRequired(true); + + Option inputDir = new Option("inputDir", true, + "optional: directory for original binary input documents."+ + " If not specified, -extractDir is crawled as is."); + inputDir.setRequired(true); + + OPTIONS = new Options() + .addOption(db) + .addOption(extractDir) + .addOption(inputDir) + .addOption("bc", "optional: tika-batch config file") + .addOption("numConsumers", true, "optional: number of consumer threads") + .addOption(new Option("alterExtract", true, + "for json-formatted extract files, " + + "process full metadata list ('as_is'=default), " + + "take just the first/container document ('first_only'), " + + "concatenate all content into the first metadata item ('concatenate_content')")); + + } + + public static void USAGE() { + HelpFormatter helpFormatter = new HelpFormatter(); + helpFormatter.printHelp( + 80, + "java -jar tika-eval-x.y.jar 
Profile -extractDir extracts -db mydb [-inputDir input]", + "Tool: Profile", + ExtractProfiler.OPTIONS, + "Note: for h2 db, do not include the .mv.db at the end of the db name."); + } + + private final static String FIELD = "f"; + + public static TableInfo ERROR_TABLE = new TableInfo("errors", + new ColInfo(Cols.CONTAINER_ID, Types.INTEGER), + new ColInfo(Cols.FILE_PATH, Types.VARCHAR, FILE_PATH_MAX_LEN), + new ColInfo(Cols.EXTRACT_ERROR_TYPE_ID, Types.INTEGER), + new ColInfo(Cols.PARSE_ERROR_TYPE_ID, Types.INTEGER) + ); + + public static TableInfo EXCEPTION_TABLE = new TableInfo("parse_exceptions", + new ColInfo(Cols.ID, Types.INTEGER, "PRIMARY KEY"), + new ColInfo(Cols.ORIG_STACK_TRACE, Types.VARCHAR, 8192), + new ColInfo(Cols.SORT_STACK_TRACE, Types.VARCHAR, 8192), + new ColInfo(Cols.PARSE_EXCEPTION_TYPE_ID, Types.INTEGER) + ); + + + public static TableInfo CONTAINER_TABLE = new TableInfo("containers", + new ColInfo(Cols.CONTAINER_ID, Types.INTEGER, "PRIMARY KEY"), + new ColInfo(Cols.FILE_PATH, Types.VARCHAR, FILE_PATH_MAX_LEN), + new ColInfo(Cols.LENGTH, Types.BIGINT), + new ColInfo(Cols.EXTRACT_FILE_LENGTH, Types.BIGINT) + ); + + public static TableInfo PROFILE_TABLE = new TableInfo("profiles", + new ColInfo(Cols.ID, Types.INTEGER, "PRIMARY KEY"), + new ColInfo(Cols.CONTAINER_ID, Types.INTEGER), + new ColInfo(Cols.FILE_NAME, Types.VARCHAR, 256), + new ColInfo(Cols.MD5, Types.CHAR, 32), + new ColInfo(Cols.LENGTH, Types.BIGINT), + new ColInfo(Cols.IS_EMBEDDED, Types.BOOLEAN), + new ColInfo(Cols.FILE_EXTENSION, Types.VARCHAR, 12), + new ColInfo(Cols.MIME_TYPE_ID, Types.INTEGER), + new ColInfo(Cols.ELAPSED_TIME_MILLIS, Types.INTEGER), + new ColInfo(Cols.NUM_ATTACHMENTS, Types.INTEGER), + new ColInfo(Cols.NUM_METADATA_VALUES, Types.INTEGER), + new ColInfo(Cols.NUM_PAGES, Types.INTEGER), + new ColInfo(Cols.HAS_CONTENT, Types.BOOLEAN) + ); + + public static TableInfo EMBEDDED_FILE_PATH_TABLE = new TableInfo("emb_file_names", + new ColInfo(Cols.ID, Types.INTEGER, 
"PRIMARY KEY"), + new ColInfo(Cols.EMBEDDED_FILE_PATH, Types.VARCHAR, 1024) + ); + + public static TableInfo CONTENTS_TABLE = new TableInfo("contents", + new ColInfo(Cols.ID, Types.INTEGER, "PRIMARY KEY"), + new ColInfo(Cols.CONTENT_LENGTH, Types.INTEGER), + new ColInfo(Cols.NUM_TOKENS, Types.INTEGER), + new ColInfo(Cols.NUM_UNIQUE_TOKENS, Types.INTEGER), + new ColInfo(Cols.COMMON_TOKENS_LANG, Types.VARCHAR, 12), + new ColInfo(Cols.NUM_COMMON_TOKENS, Types.INTEGER), + new ColInfo(Cols.TOP_N_TOKENS, Types.VARCHAR, 1024), + new ColInfo(Cols.NUM_ALPHABETIC_TOKENS, Types.INTEGER), + new ColInfo(Cols.LANG_ID_1, Types.VARCHAR, 12), + new ColInfo(Cols.LANG_ID_PROB_1, Types.FLOAT), + new ColInfo(Cols.LANG_ID_2, Types.VARCHAR, 12), + new ColInfo(Cols.LANG_ID_PROB_2, Types.FLOAT), + new ColInfo(Cols.UNICODE_CHAR_BLOCKS, Types.VARCHAR, 1024), + new ColInfo(Cols.TOKEN_ENTROPY_RATE, Types.FLOAT), + new ColInfo(Cols.TOKEN_LENGTH_SUM, Types.INTEGER), + new ColInfo(Cols.TOKEN_LENGTH_MEAN, Types.FLOAT), + new ColInfo(Cols.TOKEN_LENGTH_STD_DEV, Types.FLOAT) + ); + + private final Path inputDir; + private final Path extractDir; + private final ExtractReader.ALTER_METADATA_LIST alterExtractList; + private final ExtractReader extractReader = new ExtractReader(); + + public ExtractProfiler(ArrayBlockingQueue<FileResource> queue, + Path inputDir, Path extractDir, + IDBWriter dbWriter, ExtractReader.ALTER_METADATA_LIST alterExtractList) { + super(queue, dbWriter); + this.inputDir = inputDir; + this.extractDir = extractDir; + this.alterExtractList = alterExtractList; + } + + @Override + public boolean processFileResource(FileResource fileResource) { + Metadata metadata = fileResource.getMetadata(); + EvalFilePaths fps = null; + + if (inputDir != null && inputDir.equals(extractDir)) { + //crawling an extract dir + fps = getPathsFromExtractCrawl(metadata, extractDir); + } else { + fps = getPathsFromSrcCrawl(metadata, inputDir, extractDir); + } + List<Metadata> metadataList = 
extractReader.loadExtract(fps.getExtractFile(), alterExtractList); + + Map<Cols, String> contOutput = new HashMap<>(); + String containerId = Integer.toString(CONTAINER_ID.incrementAndGet()); + Long srcFileLen = getSourceFileLength(fps, metadataList); + contOutput.put(Cols.LENGTH, + srcFileLen > NON_EXISTENT_FILE_LENGTH ? + Long.toString(srcFileLen): ""); + contOutput.put(Cols.CONTAINER_ID, containerId); + contOutput.put(Cols.FILE_PATH, fps.getRelativeSourceFilePath().toString()); + + if (fps.getExtractFileLength() > 0) { + contOutput.put(Cols.EXTRACT_FILE_LENGTH, + (fps.getExtractFile() == null) ? + "" : + Long.toString(fps.getExtractFileLength())); + } + try { + writer.writeRow(CONTAINER_TABLE, contOutput); + } catch (IOException e) { + throw new RuntimeException(e); + } + + + if (metadataList == null) { + try { + writeError(ERROR_TABLE, containerId, + fps.getRelativeSourceFilePath().toString(), fps.getExtractFile()); + } catch (IOException e) { + throw new RuntimeException(e); + } + return true; + } + + //TODO: calculate num_attachments, add to profile table + + List<Integer> numAttachments = countAttachments(metadataList); + int i = 0; + for (Metadata m : metadataList) { + String fileId = Integer.toString(ID.incrementAndGet()); + writeProfileData(fps, i, m, fileId, containerId, numAttachments, PROFILE_TABLE); + writeEmbeddedPathData(i, fileId, m, EMBEDDED_FILE_PATH_TABLE); + writeExceptionData(fileId, m, EXCEPTION_TABLE); + try { + writeContentData(fileId, m, FIELD, CONTENTS_TABLE); + } catch (IOException e) { + throw new RuntimeException(e); + } + i++; + } + return true; + } + + private void writeEmbeddedPathData(int i, String fileId, Metadata m, + TableInfo embeddedFilePathTable) { + if (i == 0) { + return; + } + Map<Cols, String> data = new HashMap<>(); + data.put(Cols.ID, fileId); + data.put(Cols.EMBEDDED_FILE_PATH, + m.get(RecursiveParserWrapper.EMBEDDED_RESOURCE_PATH)); + try { + writer.writeRow(embeddedFilePathTable, data); + } catch (IOException e) { + 
throw new RuntimeException(e); + } + } +}
