http://git-wip-us.apache.org/repos/asf/tika/blob/5e49c330/tika-eval/src/test/java/org/apache/tika/eval/ComparerBatchTest.java ---------------------------------------------------------------------- diff --git a/tika-eval/src/test/java/org/apache/tika/eval/ComparerBatchTest.java b/tika-eval/src/test/java/org/apache/tika/eval/ComparerBatchTest.java new file mode 100644 index 0000000..0d925cf --- /dev/null +++ b/tika-eval/src/test/java/org/apache/tika/eval/ComparerBatchTest.java @@ -0,0 +1,411 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +package org.apache.tika.eval; + +import static org.junit.Assert.assertEquals; +import static org.junit.Assert.assertTrue; + +import java.io.File; +import java.nio.file.FileSystems; +import java.nio.file.Files; +import java.nio.file.Path; +import java.sql.Connection; +import java.sql.ResultSet; +import java.sql.ResultSetMetaData; +import java.sql.Statement; +import java.util.ArrayList; +import java.util.HashMap; +import java.util.HashSet; +import java.util.List; +import java.util.Map; +import java.util.Set; + +import org.apache.commons.io.FileUtils; +import org.apache.tika.batch.fs.FSBatchTestBase; +import org.apache.tika.eval.db.Cols; +import org.junit.AfterClass; +import org.junit.BeforeClass; +import org.junit.Ignore; +import org.junit.Test; + +@Ignore("need to fix tika-batch tests to make this work") +public class ComparerBatchTest extends FSBatchTestBase { + + public final static String COMPARER_PROCESS_CLASS = "org.apache.tika.batch.fs.FSBatchProcessCLI"; + + private static Path dbDir; + private static Connection conn; + + private final static String compJoinCont = ""; + /*ExtractComparer.COMPARISONS_TABLE+" cmp " + + "join "+ExtractComparer.CONTAINERS_TABLE + " cnt "+ + "on cmp."+AbstractProfiler.CONTAINER_HEADERS.CONTAINER_ID+ + " = cnt."+AbstractProfiler.CONTAINER_HEADERS.CONTAINER_ID;*/ + + @BeforeClass + public static void setUp() throws Exception { + + File inputRoot = new File(ComparerBatchTest.class.getResource("/test-dirs").toURI()); + dbDir = Files.createTempDirectory(inputRoot.toPath(), "tika-test-db-dir-"); + Map<String, String> args = new HashMap<>(); + Path db = FileSystems.getDefault().getPath(dbDir.toString(), "comparisons_test"); + args.put("-db", db.toString()); + + //for debugging, you can use this to select only one file pair to load + //args.put("-includeFilePat", "file8.*"); +/* + BatchProcessTestExecutor ex = new BatchProcessTestExecutor(COMPARER_PROCESS_CLASS, args, + "/tika-batch-comparison-eval-config.xml"); + StreamStrings 
streamStrings = ex.execute(); + System.out.println(streamStrings.getErrString()); + System.out.println(streamStrings.getOutString()); + H2Util dbUtil = new H2Util(db); + conn = dbUtil.getConnection();*/ + } + + @AfterClass + public static void tearDown() throws Exception { + + conn.close(); + + FileUtils.deleteDirectory(dbDir.toFile()); + } + + + @Test + public void testSimpleDBWriteAndRead() throws Exception { + Set<String> set = new HashSet<>(); + //filenames + List<String> list = getColStrings(Cols.FILE_NAME.name(), + ExtractComparer.PROFILES_A.getName(), ""); + assertEquals(7, list.size()); + assertTrue(list.contains("file1.pdf")); + + //container ids in comparisons table + list = getColStrings(Cols.CONTAINER_ID.name(), + ExtractComparer.COMPARISON_CONTAINERS.getName(),""); + assertEquals(10, list.size()); + set.clear(); set.addAll(list); + assertEquals(10, set.size()); +/* + //ids in comparisons table + list = getColStrings(AbstractProfiler.HEADERS.ID.name(), + compTable,""); + assertEquals(9, list.size()); + set.clear(); set.addAll(list); + assertEquals(9, set.size());*/ + } + + + + /* + @Test + public void testFile1PDFRow() throws Exception { + String where = fp+"='file1.pdf'"; + Map<String, String> data = getRow(compJoinCont, where); + String result = data.get(ExtractComparer.COMPARISON_HEADERS.TOP_10_UNIQUE_TOKEN_DIFFS + "_A"); + assertTrue(result.startsWith("over: 1")); + + result = data.get(ExtractComparer.COMPARISON_HEADERS.TOP_10_UNIQUE_TOKEN_DIFFS + "_B"); + assertTrue(result.startsWith("aardvark: 3 | bear: 2")); + + + assertEquals("aardvark: 3 | bear: 2", + data.get(ExtractComparer.COMPARISON_HEADERS.TOP_10_MORE_IN_B.toString())); + assertEquals("fox: 2 | lazy: 1 | over: 1", + data.get(ExtractComparer.COMPARISON_HEADERS.TOP_10_MORE_IN_A.toString())); + assertEquals("12", data.get(ExtractComparer.HEADERS.NUM_TOKENS+"_A")); + assertEquals("13", data.get(ExtractComparer.HEADERS.NUM_TOKENS+"_B")); + assertEquals("8", 
data.get(ExtractComparer.HEADERS.NUM_UNIQUE_TOKENS+"_A")); + assertEquals("9", data.get(ExtractComparer.HEADERS.NUM_UNIQUE_TOKENS+"_B")); + + assertEquals(ExtractComparer.COMPARISON_HEADERS.OVERLAP.name(), + 0.64f, Float.parseFloat(data.get("OVERLAP")), 0.0001f); + + assertEquals(ExtractComparer.COMPARISON_HEADERS.DICE_COEFFICIENT.name(), + 0.8235294f, Float.parseFloat(data.get("DICE_COEFFICIENT")), 0.0001f); + + assertEquals(ExtractComparer.HEADERS.TOKEN_LENGTH_MEAN+"_A", 3.83333d, + Double.parseDouble( + data.get(ExtractComparer.HEADERS.TOKEN_LENGTH_MEAN+"_A")), 0.0001d); + + assertEquals(ExtractComparer.HEADERS.TOKEN_LENGTH_MEAN+"_B", 4.923d, + Double.parseDouble( + data.get(ExtractComparer.HEADERS.TOKEN_LENGTH_MEAN+"_B")), 0.0001d); + + assertEquals(ExtractComparer.HEADERS.TOKEN_LENGTH_STD_DEV+"_A", 1.0298d, + Double.parseDouble( + data.get(ExtractComparer.HEADERS.TOKEN_LENGTH_STD_DEV+"_A")), 0.0001d); + + assertEquals(ExtractComparer.HEADERS.TOKEN_LENGTH_STD_DEV+"_B", 1.9774d, + Double.parseDouble(data.get(ExtractComparer.HEADERS.TOKEN_LENGTH_STD_DEV+"_B")), 0.0001d); + + assertEquals(ExtractComparer.HEADERS.TOKEN_LENGTH_SUM+"_A", 46, + Integer.parseInt( + data.get(ExtractComparer.HEADERS.TOKEN_LENGTH_SUM+"_A"))); + + assertEquals(ExtractComparer.HEADERS.TOKEN_LENGTH_SUM+"_B", 64, + Integer.parseInt(data.get(ExtractComparer.HEADERS.TOKEN_LENGTH_SUM+"_B"))); + + assertEquals("TOKEN_ENTROPY_RATE_A", 0.237949, + Double.parseDouble(data.get("TOKEN_ENTROPY_RATE_A")), 0.0001d); + + assertEquals("TOKEN_ENTROPY_RATE_B", 0.232845, + Double.parseDouble(data.get("TOKEN_ENTROPY_RATE_B")), 0.0001d); + + } + + + @Test + public void testEmpty() throws Exception { + String where = fp+"='file4_emptyB.pdf'"; + Map<String, String> data = getRow(contTable, where); + assertNull(data.get(AbstractProfiler.CONTAINER_HEADERS.JSON_EX + + ExtractComparer.aExtension)); + assertTrue(data.get(AbstractProfiler.CONTAINER_HEADERS.JSON_EX + + 
ExtractComparer.bExtension).equals(AbstractProfiler.JSON_PARSE_EXCEPTION)); + + where = fp+"='file5_emptyA.pdf'"; + data = getRow(contTable, where); + assertNull(data.get(AbstractProfiler.CONTAINER_HEADERS.JSON_EX + + ExtractComparer.bExtension)); + assertTrue(data.get(AbstractProfiler.CONTAINER_HEADERS.JSON_EX+ + ExtractComparer.aExtension).equals(AbstractProfiler.JSON_PARSE_EXCEPTION)); + } + + @Test + public void testMissingAttachment() throws Exception { + String where = fp+"='file2_attachANotB.doc' and "+AbstractProfiler.HEADERS.EMBEDDED_FILE_PATH+ + "='inner.txt'"; + Map<String, String> data = getRow(compJoinCont, where); + assertContains("attachment: 1", data.get(ExtractComparer.COMPARISON_HEADERS.TOP_10_MORE_IN_A.name())); + assertNotContained("fox", data.get(ExtractComparer.COMPARISON_HEADERS.TOP_10_MORE_IN_B.name())); + assertNull(data.get(ExtractComparer.HEADERS.TOP_N_TOKENS + + ExtractComparer.bExtension)); + assertNotContained("fox", data.get(ExtractComparer.COMPARISON_HEADERS.TOP_10_UNIQUE_TOKEN_DIFFS + + ExtractComparer.bExtension)); + + assertEquals("3", data.get("NUM_METADATA_VALUES_A")); + assertNull(data.get("DIFF_NUM_ATTACHMENTS")); + assertNull(data.get("NUM_METADATA_VALUES_B")); + assertEquals("0", data.get("NUM_UNIQUE_TOKENS_B")); + assertNull(data.get("TOKEN_ENTROPY_RATE_B")); + assertNull(data.get("NUM_EN_STOPS_TOP_N_B")); + + where = fp+"='file3_attachBNotA.doc' and "+AbstractProfiler.HEADERS.EMBEDDED_FILE_PATH+ + "='inner.txt'"; + data = getRow(compJoinCont, where); + assertContains("attachment: 1", data.get(ExtractComparer.COMPARISON_HEADERS.TOP_10_MORE_IN_B.name())); + assertNotContained("fox", data.get(ExtractComparer.COMPARISON_HEADERS.TOP_10_MORE_IN_A.name())); + assertNull(data.get(ExtractComparer.HEADERS.TOP_N_TOKENS + + ExtractComparer.aExtension)); + assertNotContained("fox", data.get(ExtractComparer.COMPARISON_HEADERS.TOP_10_UNIQUE_TOKEN_DIFFS + + ExtractComparer.aExtension)); + + assertEquals("3", 
data.get("NUM_METADATA_VALUES_B")); + assertNull(data.get("DIFF_NUM_ATTACHMENTS")); + assertNull(data.get("NUM_METADATA_VALUES_A")); + assertEquals("0", data.get("NUM_UNIQUE_TOKENS_A")); + assertNull(data.get("TOKEN_ENTROPY_RATE_A")); + assertNull(data.get("NUM_EN_STOPS_TOP_N_A")); + + } + + @Test + public void testBothBadJson() throws Exception { + debugDumpAll(contTable); + String where = fp+"='file7_badJson.pdf'"; + Map<String, String> data = getRow(contTable, where); + assertEquals(AbstractProfiler.JSON_PARSE_EXCEPTION, + data.get(AbstractProfiler.CONTAINER_HEADERS.JSON_EX+ ExtractComparer.aExtension)); + assertEquals(AbstractProfiler.JSON_PARSE_EXCEPTION, + data.get(AbstractProfiler.CONTAINER_HEADERS.JSON_EX+ ExtractComparer.bExtension)); + assertEquals("file7_badJson.pdf", + data.get(AbstractProfiler.CONTAINER_HEADERS.FILE_PATH.name())); + assertEquals("61", data.get("JSON_FILE_LENGTH_A")); + assertEquals("0", data.get("JSON_FILE_LENGTH_B")); + assertEquals("pdf", data.get(AbstractProfiler.CONTAINER_HEADERS.FILE_EXTENSION.name())); + + } + + @Test + public void testAccessPermissionException() throws Exception { + String sql = "select "+ + AbstractProfiler.EXCEPTION_HEADERS.ACCESS_PERMISSION_EXCEPTION.name() + + " from " + AbstractProfiler.EXCEPTIONS_TABLE+"_A exA "+ + " join " + ExtractComparer.COMPARISONS_TABLE + " cmp on cmp.ID=exA.ID "+ + " join " + ExtractComparer.CONTAINERS_TABLE + " cont on cmp.CONTAINER_ID=cont.CONTAINER_ID "+ + " where "+fp+"='file6_accessEx.pdf'"; + Statement st = conn.createStatement(); + ResultSet rs = st.executeQuery(sql); + List<String> results = new ArrayList<String>(); + while (rs.next()) { + results.add(rs.getString(1)); + } + assertEquals(1, results.size()); + assertEquals("TRUE", results.get(0)); + + sql = "select "+ + AbstractProfiler.EXCEPTION_HEADERS.ACCESS_PERMISSION_EXCEPTION.name() + + " from " + AbstractProfiler.EXCEPTIONS_TABLE+"_B exB "+ + " join " + ExtractComparer.COMPARISONS_TABLE + " cmp on cmp.ID=exB.ID "+ + " 
join " + ExtractComparer.CONTAINERS_TABLE + " cont on cmp.CONTAINER_ID=cont.CONTAINER_ID "+ + " where "+fp+"='file6_accessEx.pdf'"; + st = conn.createStatement(); + rs = st.executeQuery(sql); + results = new ArrayList<String>(); + while (rs.next()) { + results.add(rs.getString(1)); + } + assertEquals(1, results.size()); + assertEquals("TRUE", results.get(0)); + + } + + @Test + public void testContainerException() throws Exception { + String sql = "select * "+ + " from " + AbstractProfiler.EXCEPTIONS_TABLE+"_A exA "+ + " join " + ExtractComparer.COMPARISONS_TABLE + " cmp on cmp.ID=exA.ID "+ + " join " + ExtractComparer.CONTAINERS_TABLE + " cont on cmp.CONTAINER_ID=cont.CONTAINER_ID "+ + "where "+fp+"='file8_IOEx.pdf'"; + Statement st = conn.createStatement(); + ResultSet rs = st.executeQuery(sql); + + Map<String, String> data = new HashMap<String,String>(); + ResultSetMetaData rsM = rs.getMetaData(); + while (rs.next()) { + for (int i = 1; i <= rsM.getColumnCount(); i++) + data.put(rsM.getColumnName(i), rs.getString(i)); + } + + String sortStack = data.get(AbstractProfiler.EXCEPTION_HEADERS.SORT_STACK_TRACE.name()); + sortStack = sortStack.replaceAll("[\r\n]", "<N>"); + assertTrue(sortStack.startsWith("java.lang.RuntimeException<N>")); + + String fullStack = data.get(AbstractProfiler.EXCEPTION_HEADERS.ORIG_STACK_TRACE.name()); + assertTrue( + fullStack.startsWith("java.lang.RuntimeException: java.io.IOException: Value is not an integer")); + } + + private void debugDumpAll(String table) throws Exception { + Statement st = conn.createStatement(); + String sql = "select * from "+table; + ResultSet rs = st.executeQuery(sql); + ResultSetMetaData m = rs.getMetaData(); + for (int i = 1; i <= m.getColumnCount(); i++) { + System.out.print(m.getColumnName(i) + ", "); + } + System.out.println("\n"); + while (rs.next()) { + for (int i = 1; i <= m.getColumnCount(); i++) { + System.out.print(rs.getString(i)+", "); + } + System.out.println("\n"); + } + st.close(); + + } + */ + 
private void debugShowColumns(String table) throws Exception { + Statement st = conn.createStatement(); + String sql = "select * from "+table; + ResultSet rs = st.executeQuery(sql); + ResultSetMetaData m = rs.getMetaData(); + for (int i = 1; i <= m.getColumnCount(); i++) { + System.out.println(i+" : "+m.getColumnName(i)); + } + st.close(); + } + + //return the string value for one cell + private String getString(String colName, String table, String where) throws Exception { + List<String> results = getColStrings(colName, table, where); + if (results.size() > 1) { + throw new RuntimeException("more than one result"); + } else if (results.size() == 0) { + throw new RuntimeException("no results"); + } + + return results.get(0); + } + + + private Map<String, String> getRow(String table, String where) throws Exception { + String sql = getSql("*", table, where); + Map<String, String> results = new HashMap<String, String>(); + Statement st = null; + + try { + st = conn.createStatement(); + ResultSet rs = st.executeQuery(sql); + ResultSetMetaData m = rs.getMetaData(); + int rows = 0; + while (rs.next()) { + if (rows > 0) { + throw new RuntimeException("returned more than one row!"); + } + for (int i = 1; i <= m.getColumnCount(); i++) { + results.put(m.getColumnName(i), rs.getString(i)); + } + rows++; + } + } finally { + if (st != null) { + st.close(); + } + } + return results; + + } + + //return the string representations of the column values for one column + //as a list of strings + private List<String> getColStrings(String colName) throws Exception { + return getColStrings(colName, ExtractComparer.CONTENT_COMPARISONS.getName(), null); + } + + private List<String> getColStrings(String colName, String table, String where) throws Exception { + String sql = getSql(colName, table, where); + List<String> results = new ArrayList<>(); + Statement st = null; + try { + st = conn.createStatement(); + System.out.println("SQL: "+sql); + ResultSet rs = st.executeQuery(sql); + while 
(rs.next()) { + results.add(rs.getString(1)); + } + } finally { + if (st != null) { + st.close(); + } + } + return results; + } + + private String getSql(String colName, String table, String where) { + StringBuilder sb = new StringBuilder(); + sb.append("select ").append(colName).append(" from ").append(table); + if (where != null && ! where.equals("")) { + sb.append(" where ").append(where); + } + return sb.toString(); + } + +}
http://git-wip-us.apache.org/repos/asf/tika/blob/5e49c330/tika-eval/src/test/java/org/apache/tika/eval/ProfilerBatchTest.java ---------------------------------------------------------------------- diff --git a/tika-eval/src/test/java/org/apache/tika/eval/ProfilerBatchTest.java b/tika-eval/src/test/java/org/apache/tika/eval/ProfilerBatchTest.java new file mode 100644 index 0000000..c3f0a7e --- /dev/null +++ b/tika-eval/src/test/java/org/apache/tika/eval/ProfilerBatchTest.java @@ -0,0 +1,236 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +package org.apache.tika.eval; + +import static org.junit.Assert.assertEquals; +import static org.junit.Assert.assertTrue; + +import java.io.IOException; +import java.nio.file.DirectoryStream; +import java.nio.file.Files; +import java.nio.file.Path; +import java.nio.file.Paths; +import java.sql.Connection; +import java.sql.ResultSet; +import java.sql.SQLException; +import java.sql.Statement; +import java.util.ArrayList; +import java.util.HashMap; +import java.util.Iterator; +import java.util.List; +import java.util.Map; + +import org.apache.tika.eval.db.Cols; +import org.apache.tika.eval.db.H2Util; +import org.apache.tika.eval.db.TableInfo; +import org.junit.AfterClass; +import org.junit.BeforeClass; +import org.junit.Ignore; +import org.junit.Test; +@Ignore +public class ProfilerBatchTest { + + public final static String COMPARER_PROCESS_CLASS = "org.apache.tika.batch.fs.FSBatchProcessCLI"; + + private static Path dbDir; + private static Connection conn; + + private final static String profileTable = ExtractProfiler.PROFILE_TABLE.getName(); + private final static String exTable = ExtractProfiler.EXCEPTION_TABLE.getName(); + private final static String fpCol = Cols.FILE_PATH.name(); + + @BeforeClass + public static void setUp() throws Exception { + + Path inputRoot = Paths.get(new ComparerBatchTest().getClass().getResource("/test-dirs/extractsA").toURI()); + dbDir = Files.createTempDirectory(inputRoot, "tika-test-db-dir-"); + Map<String, String> args = new HashMap<>(); + Path db = dbDir.resolve("profiler_test"); + args.put("-db", db.toString()); + + //for debugging, you can use this to select only one file pair to load + //args.put("-includeFilePat", "file8.*"); + + /* BatchProcessTestExecutor ex = new BatchProcessTestExecutor(COMPARER_PROCESS_CLASS, args, + "/single-file-profiler-crawl-input-config.xml"); + StreamStrings streamStrings = ex.execute(); + System.out.println(streamStrings.getErrString()); + System.out.println(streamStrings.getOutString());*/ + 
H2Util dbUtil = new H2Util(db); + conn = dbUtil.getConnection(true); + } + @AfterClass + public static void tearDown() throws IOException { + + try{ + conn.close(); + } catch (SQLException e) { + throw new RuntimeException(e); + } + + + DirectoryStream<Path> dStream = Files.newDirectoryStream(dbDir); + Iterator<Path> it = dStream.iterator(); + while (it.hasNext()) { + Path p = it.next(); + Files.delete(p); + } + dStream.close(); + Files.delete(dbDir); + } + + @Test + public void testSimpleDBWriteAndRead() throws Exception { + + Statement st = null; + List<String> fNameList = new ArrayList<>(); + try { + String sql = "select * from "+ ExtractProfiler.CONTAINER_TABLE.getName(); + st = conn.createStatement(); + ResultSet rs = st.executeQuery(sql); + while (rs.next()) { + String fileName = rs.getString(Cols.FILE_PATH.name()); + fNameList.add(fileName); + } + } finally { + if (st != null) { + st.close(); + } + } + debugTable(ExtractProfiler.CONTAINER_TABLE); + debugTable(ExtractProfiler.PROFILE_TABLE); + debugTable(ExtractProfiler.CONTENTS_TABLE); + debugTable(ExtractProfiler.EXCEPTION_TABLE); + debugTable(ExtractProfiler.ERROR_TABLE); + assertEquals(10, fNameList.size()); + assertTrue("file1.pdf", fNameList.contains("file1.pdf")); + assertTrue("file2_attachANotB.doc", fNameList.contains("file2_attachANotB.doc")); + assertTrue("file3_attachBNotA.doc", fNameList.contains("file3_attachBNotA.doc")); + assertTrue("file4_emptyB.pdf", fNameList.contains("file4_emptyB.pdf")); + assertTrue("file7_badJson.pdf", fNameList.contains("file7_badJson.pdf")); + } + + @Test + public void testExtractErrors() throws Exception { + String sql = "select EXTRACT_ERROR_TYPE_ID from errors e" + + " join containers c on c.container_id = e.container_id "+ + " where c.file_path='file9_noextract.txt'"; + + assertEquals("missing extract: file9_noextract.txt", "0", + getSingleResult(sql)); + debugTable(ExtractProfiler.CONTAINER_TABLE); + debugTable(ExtractProfiler.PROFILE_TABLE); + 
debugTable(ExtractProfiler.CONTENTS_TABLE); + debugTable(ExtractProfiler.EXCEPTION_TABLE); + debugTable(ExtractProfiler.ERROR_TABLE); + + sql = "select EXTRACT_ERROR_TYPE_ID from errors e" + + " join containers c on c.container_id = e.container_id "+ + " where c.file_path='file5_emptyA.pdf'"; + assertEquals("empty extract: file5_emptyA.pdf", "1", + getSingleResult(sql)); + + sql = "select EXTRACT_ERROR_TYPE_ID from errors e" + + " join containers c on c.container_id = e.container_id "+ + " where c.file_path='file7_badJson.pdf'"; + assertEquals("extract error:file7_badJson.pdf", "2", + getSingleResult(sql)); + + } + + @Test + public void testParseErrors() throws Exception { + debugTable(ExtractProfiler.ERROR_TABLE); + String sql = "select file_path from errors where container_id is null"; + assertEquals("file10_permahang.txt", + getSingleResult(sql)); + + sql = "select extract_error_type_id from errors where file_path='file11_oom.txt'"; + assertEquals(Integer.toString(AbstractProfiler. + EXTRACT_ERROR_TYPE. + ZERO_BYTE_EXTRACT_FILE.ordinal()), + getSingleResult(sql)); + + sql = "select parse_error_type_id from errors where file_path='file11_oom.txt'"; + assertEquals(Integer.toString(AbstractProfiler. + PARSE_ERROR_TYPE. + OOM.ordinal()), + getSingleResult(sql)); + + } + + @Test + public void testParseExceptions() throws Exception { + debugTable(ExtractProfiler.EXCEPTION_TABLE); + } + + private String getSingleResult(String sql) throws Exception { + Statement st = null; + st = conn.createStatement(); + ResultSet rs = st.executeQuery(sql); + int hits = 0; + String val = ""; + while (rs.next()) { + assertEquals("must have only one column in result", + 1, rs.getMetaData().getColumnCount()); + val = rs.getString(1); + hits++; + } + assertEquals("must have only one hit", 1, hits); + return val; + } + + //TODO: lots more testing! 
+ + public void debugTable(TableInfo table) throws Exception { + Statement st = null; + try { + String sql = "select * from "+table.getName(); + st = conn.createStatement(); + ResultSet rs = st.executeQuery(sql); + int colCount = rs.getMetaData().getColumnCount(); + System.out.println("TABLE: "+table.getName()); + for (int i = 1; i <= colCount; i++) { + if (i > 1) { + System.out.print(" | "); + } + System.out.print(rs.getMetaData().getColumnName(i)); + } + System.out.println(""); + int rowCount = 0; + while (rs.next()) { + for (int i = 1; i <= colCount; i++) { + if (i > 1) { + System.out.print(" | "); + } + System.out.print(rs.getString(i)); + rowCount++; + } + System.out.println(""); + } + if (rowCount == 0) { + System.out.println(table.getName() + " was empty"); + } + } finally { + if (st != null) { + st.close(); + } + } + + } +} http://git-wip-us.apache.org/repos/asf/tika/blob/5e49c330/tika-eval/src/test/java/org/apache/tika/eval/SimpleComparerTest.java ---------------------------------------------------------------------- diff --git a/tika-eval/src/test/java/org/apache/tika/eval/SimpleComparerTest.java b/tika-eval/src/test/java/org/apache/tika/eval/SimpleComparerTest.java new file mode 100644 index 0000000..72e8008 --- /dev/null +++ b/tika-eval/src/test/java/org/apache/tika/eval/SimpleComparerTest.java @@ -0,0 +1,289 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. 
You may obtain a copy of the License at
 *
 * http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
package org.apache.tika.eval;

import static org.apache.tika.eval.AbstractProfiler.EXCEPTION_TYPE;
import static org.apache.tika.eval.AbstractProfiler.EXTRACT_ERROR_TYPE;
import static org.apache.tika.eval.AbstractProfiler.getContent;
import static org.junit.Assert.assertEquals;
import static org.junit.Assert.assertNull;
import static org.junit.Assert.assertTrue;

import java.nio.file.Path;
import java.nio.file.Paths;
import java.util.ArrayList;
import java.util.List;
import java.util.Map;
import java.util.SortedSet;
import java.util.TreeSet;

import org.apache.tika.MockDBWriter;
import org.apache.tika.TikaTest;
import org.apache.tika.eval.db.Cols;
import org.apache.tika.eval.db.TableInfo;
import org.apache.tika.eval.io.ExtractReader;
import org.apache.tika.eval.util.LanguageIDWrapper;
import org.apache.tika.metadata.Metadata;
import org.apache.tika.parser.RecursiveParserWrapper;
import org.junit.Before;
import org.junit.Ignore;
import org.junit.Test;

//These tests ensure that the comparer is extracting the right information
//into a Map<String,String>.  A full integration test
//should also ensure that the elements are properly being written to the db

public class SimpleComparerTest extends TikaTest {

    //comparer under test; rebuilt for every test by setUp
    private ExtractComparer comparer = null;
    //in-memory table writer used to inspect what the comparer emitted
    private MockDBWriter writer = null;

    //Builds a fresh comparer writing to a fresh MockDBWriter, loads the
    //common-token lists from the /commontokens resource, and loads the
    //built-in language-id models.
    @Before
    public void setUp() throws Exception {
        writer = new MockDBWriter();
        comparer = new ExtractComparer(null, null,
                Paths.get("extractsA"), Paths.get("extractsB"),
                writer, -1, -1,
                ExtractReader.ALTER_METADATA_LIST.AS_IS);
        AbstractProfiler.loadCommonTokens(this.getResourceAsFile("/commontokens").toPath());
        LanguageIDWrapper.loadBuiltInModels();
    }

    //Compares extract A vs extract B for file1.pdf and checks the rows written
    //to the comparison, per-extract contents, and profile tables.
    @Test
    public void testBasic() throws Exception {
        EvalFilePaths fpsA = new EvalFilePaths(
                Paths.get("file1.pdf.json"),
                getResourceAsFile("/test-dirs/extractsA/file1.pdf.json").toPath()
        );
        EvalFilePaths fpsB = new EvalFilePaths(
                Paths.get("file1.pdf.json"),
                getResourceAsFile("/test-dirs/extractsB/file1.pdf.json").toPath());

        comparer.compareFiles(fpsA, fpsB);

        List<Map<Cols, String>> tableInfos = writer.getTable(ExtractComparer.CONTENT_COMPARISONS);
        Map<Cols, String> row = tableInfos.get(0);
        assertEquals("0", row.get(Cols.ID));
        assertTrue(
                row.get(Cols.TOP_10_UNIQUE_TOKEN_DIFFS_A)
                        .startsWith("1,200: 1 | 120000: 1 | over: 1"));

        //token/length statistics for extract A
        tableInfos = writer.getTable(ExtractComparer.CONTENTS_TABLE_A);
        row = tableInfos.get(0);
        assertEquals("0", row.get(Cols.ID));
        assertEquals("70", row.get(Cols.CONTENT_LENGTH));
        assertEquals("10", row.get(Cols.NUM_UNIQUE_TOKENS));
        assertEquals("14", row.get(Cols.NUM_TOKENS));
        assertEquals("12", row.get(Cols.NUM_ALPHABETIC_TOKENS));
        assertEquals("6", row.get(Cols.NUM_COMMON_TOKENS));
        assertEquals("57", row.get(Cols.TOKEN_LENGTH_SUM));
        assertEquals("en", row.get(Cols.COMMON_TOKENS_LANG));

        //token/length statistics for extract B
        tableInfos = writer.getTable(ExtractComparer.CONTENTS_TABLE_B);
        row = tableInfos.get(0);
        assertEquals("0", row.get(Cols.ID));
        assertEquals("76", row.get(Cols.CONTENT_LENGTH));
        assertEquals("9", row.get(Cols.NUM_UNIQUE_TOKENS));
        assertEquals("13", row.get(Cols.NUM_TOKENS));
        assertEquals("4", row.get(Cols.NUM_COMMON_TOKENS));
        assertEquals("64", row.get(Cols.TOKEN_LENGTH_SUM));
        assertEquals("en", row.get(Cols.COMMON_TOKENS_LANG));

        tableInfos = writer.getTable(ExtractComparer.PROFILES_A);
        row = tableInfos.get(0);
        assertEquals("2", row.get(Cols.NUM_PAGES));

    }

    //Same as testBasic but for a Spanish-language extract; checks that the
    //common-tokens language is detected as "es".
    //NOTE(review): the first argument to each EvalFilePaths is still
    //"file1.pdf.json" while the resource is file12_es.txt.json -- looks like a
    //copy-paste from testBasic; confirm whether the label matters here.
    @Test
    public void testBasicSpanish() throws Exception {
        EvalFilePaths fpsA = new EvalFilePaths(
                Paths.get("file1.pdf.json"),
                getResourceAsFile("/test-dirs/extractsA/file12_es.txt.json").toPath()
        );
        EvalFilePaths fpsB = new EvalFilePaths(
                Paths.get("file1.pdf.json"),
                getResourceAsFile("/test-dirs/extractsB/file12_es.txt.json").toPath());

        comparer.compareFiles(fpsA, fpsB);

        List<Map<Cols, String>> tableInfos = writer.getTable(ExtractComparer.CONTENTS_TABLE_A);

        Map<Cols, String> row = tableInfos.get(0);
        assertEquals("133", row.get(Cols.CONTENT_LENGTH));
        assertEquals("7", row.get(Cols.NUM_UNIQUE_TOKENS));
        assertEquals("24", row.get(Cols.NUM_TOKENS));
        assertEquals("3", row.get(Cols.NUM_COMMON_TOKENS));
        assertEquals("108", row.get(Cols.TOKEN_LENGTH_SUM));
        assertEquals("es", row.get(Cols.COMMON_TOKENS_LANG));
        assertEquals("24", row.get(Cols.NUM_ALPHABETIC_TOKENS));

    }


    //A zero-byte extract on the B side should be recorded in the B error table
    //as a ZERO_BYTE_EXTRACT_FILE error.
    @Test
    public void testEmpty() throws Exception {
        EvalFilePaths fpsA = new EvalFilePaths(
                Paths.get("file1.pdf"),
                getResourceAsFile("/test-dirs/extractsA/file1.pdf.json").toPath()
        );
        EvalFilePaths fpsB = new EvalFilePaths(
                Paths.get("file1.pdf"),
                getResourceAsFile("/test-dirs/extractsB/file4_emptyB.pdf.json").toPath()
        );
        comparer.compareFiles(fpsA, fpsB);
        List<Map<Cols, String>> table = writer.getTable(ExtractComparer.ERROR_TABLE_B);
        Map<Cols, String> row = table.get(0);
        //debugPrintRow(row);
        assertEquals(Integer.toString(EXTRACT_ERROR_TYPE.ZERO_BYTE_EXTRACT_FILE.ordinal()),
                row.get(Cols.EXTRACT_ERROR_TYPE_ID));
    }


    //getContent truncates to the requested length and returns "" for a
    //Metadata with no content and for null Metadata.
    @Test
    public void testGetContent() throws Exception {
        Metadata m = new Metadata();
        m.add(RecursiveParserWrapper.TIKA_CONTENT, "0123456789");

        String content = getContent(m, 10);
        assertEquals(10, content.length());

        content = getContent(m, 4);
        assertEquals(4, content.length());

        //test Metadata with no content
        content = getContent(new Metadata(), 10);
        assertEquals(0, content.length());

        //test null Metadata
        content = getContent(null, 10);
        assertEquals(0, content.length());
    }

    //An access-permission exception in both extracts should land in both
    //exception tables with no stack traces recorded.
    @Test
    public void testAccessException() throws Exception {
        EvalFilePaths fpsA = new EvalFilePaths(
                Paths.get("file6_accessEx.pdf.json"),
                getResourceAsFile("/test-dirs/extractsA/file6_accessEx.pdf.json").toPath()
        );
        EvalFilePaths fpsB = new EvalFilePaths(
                Paths.get("file6_accessEx.pdf.json"),
                getResourceAsFile("/test-dirs/extractsB/file6_accessEx.pdf.json").toPath()
        );
        comparer.compareFiles(fpsA, fpsB);
        for (TableInfo t : new TableInfo[]{ExtractComparer.EXCEPTION_TABLE_A, ExtractComparer.EXCEPTION_TABLE_B}) {
            List<Map<Cols, String>> table = writer.getTable(t);

            Map<Cols, String> rowA = table.get(0);
            //debugPrintRow(rowA);
            assertEquals(Integer.toString(EXCEPTION_TYPE.ACCESS_PERMISSION.ordinal()),
                    rowA.get(Cols.PARSE_EXCEPTION_TYPE_ID));
            assertNull(rowA.get(Cols.ORIG_STACK_TRACE));
            assertNull(rowA.get(Cols.SORT_STACK_TRACE));
        }
    }


    //countAttachments should count, for each metadata entry, the number of
    //entries nested beneath its embedded-resource path.
    @Test
    public void testAttachmentCounts() {
        List<Metadata> list = new ArrayList<>();
        Metadata m0 = new Metadata();
        m0.set(RecursiveParserWrapper.EMBEDDED_RESOURCE_PATH, "dir1/dir2/file.zip");//bad data should be ignored
                                                                                    //in the first metadata object
        list.add(m0);
        Metadata m1 = new Metadata();
        m1.set(RecursiveParserWrapper.EMBEDDED_RESOURCE_PATH, "/f1.docx/f2.zip/text1.txt");
        list.add(m1);
        Metadata m2 = new Metadata();
        m2.set(RecursiveParserWrapper.EMBEDDED_RESOURCE_PATH, "/f1.docx/f2.zip/text2.txt");
        list.add(m2);
        Metadata m3 = new Metadata();
        m3.set(RecursiveParserWrapper.EMBEDDED_RESOURCE_PATH, "/f1.docx/f2.zip");
        list.add(m3);
        Metadata m4 = new Metadata();
        m4.set(RecursiveParserWrapper.EMBEDDED_RESOURCE_PATH, "/f1.docx");
        list.add(m4);
        Metadata m5 = new Metadata();
        m5.set(RecursiveParserWrapper.EMBEDDED_RESOURCE_PATH, "/f1.docx/text3.txt");
        list.add(m5);

        List<Integer> counts = AbstractProfiler.countAttachments(list);

        List<Integer> expected = new ArrayList<>();
        expected.add(5);
        expected.add(0);
        expected.add(0);
        expected.add(2);
        expected.add(4);
        expected.add(0);
        assertEquals(expected, counts);
    }


    //manual debugging hook: run a comparison with the short common-token list
    //and (optionally) dump every output table; kept @Ignore'd
    @Test
    @Ignore
    public void testDebug() throws Exception {
        Path commonTokens = Paths.get(getResourceAsFile("/common_tokens_short.txt").toURI());
        AbstractProfiler.loadCommonTokens(commonTokens);
        EvalFilePaths fpsA = new EvalFilePaths(
                Paths.get("file1.pdf.json"),
                getResourceAsFile("/test-dirs/extractsA/file1.pdf.json").toPath()
        );
        EvalFilePaths fpsB = new EvalFilePaths(
                Paths.get("file1.pdf.json"),
                getResourceAsFile("/test-dirs/extractsB/file1.pdf.json").toPath()
        );
        comparer.compareFiles(fpsA, fpsB);
        for (TableInfo t : new TableInfo[]{
                ExtractComparer.COMPARISON_CONTAINERS,
                ExtractComparer.ERROR_TABLE_A,
                ExtractComparer.ERROR_TABLE_B,
                ExtractComparer.EXCEPTION_TABLE_A,
                ExtractComparer.EXCEPTION_TABLE_B,
                ExtractComparer.PROFILES_A,
                ExtractComparer.PROFILES_B,
                ExtractComparer.CONTENTS_TABLE_A,
                ExtractComparer.CONTENTS_TABLE_B,
                ExtractComparer.CONTENT_COMPARISONS}) {
            //debugPrintTable(t);
        }
    }

    //debugging aid: print every row of a captured table, keyed by row index
    private void debugPrintTable(TableInfo tableInfo) {
        List<Map<Cols, String>> table = writer.getTable(tableInfo);
        if (table == null) {
            return;
        }
        int i = 0;
        System.out.println("TABLE: "+tableInfo.getName());
        for (Map<Cols, String> row : table) {
            SortedSet<Cols> keys = new TreeSet<Cols>(row.keySet());
            for (Cols key : keys) {
                System.out.println( i + " :: " + key + " : " + row.get(key));
            }
            i++;
        }
        System.out.println("");
    }

    //debugging aid: print a single row's key/value pairs in sorted key order
    private void debugPrintRow(Map<Cols, String> row) {
        SortedSet<Cols> keys = new TreeSet<Cols>(row.keySet());
        for (Cols key : keys) {
            System.out.println(key + " : " + row.get(key));
        }
    }
}
http://git-wip-us.apache.org/repos/asf/tika/blob/5e49c330/tika-eval/src/test/java/org/apache/tika/eval/TikaEvalCLITest.java
----------------------------------------------------------------------
diff --git a/tika-eval/src/test/java/org/apache/tika/eval/TikaEvalCLITest.java b/tika-eval/src/test/java/org/apache/tika/eval/TikaEvalCLITest.java
new file mode 100644
index 0000000..c358149
--- /dev/null
+++ b/tika-eval/src/test/java/org/apache/tika/eval/TikaEvalCLITest.java
@@ -0,0 +1,42 @@
/*
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements.  See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License.  You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
+ */ + +package org.apache.tika.eval; + +import java.util.ArrayList; +import java.util.List; + +import org.junit.Ignore; +import org.junit.Test; + +public class TikaEvalCLITest { + + @Test + @Ignore("TODO: add real tests") + public void testBasic() throws Exception { + List<String> args = new ArrayList<>(); + args.add("Profile"); + args.add("-extractDir"); + args.add("tika"); + args.add("-db"); + args.add("mydb"); + args.add("-alterExtract"); + args.add("first_only"); + TikaEvalCLI.main(args.toArray(new String[args.size()])); + } + +} http://git-wip-us.apache.org/repos/asf/tika/blob/5e49c330/tika-eval/src/test/java/org/apache/tika/eval/db/AbstractBufferTest.java ---------------------------------------------------------------------- diff --git a/tika-eval/src/test/java/org/apache/tika/eval/db/AbstractBufferTest.java b/tika-eval/src/test/java/org/apache/tika/eval/db/AbstractBufferTest.java new file mode 100644 index 0000000..7b5c3cb --- /dev/null +++ b/tika-eval/src/test/java/org/apache/tika/eval/db/AbstractBufferTest.java @@ -0,0 +1,160 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +package org.apache.tika.eval.db; + + +import static org.junit.Assert.assertEquals; + +import java.sql.SQLException; +import java.util.ArrayList; +import java.util.Collections; +import java.util.HashMap; +import java.util.List; +import java.util.Map; +import java.util.Random; +import java.util.concurrent.Callable; +import java.util.concurrent.CompletionService; +import java.util.concurrent.ExecutionException; +import java.util.concurrent.ExecutorCompletionService; +import java.util.concurrent.ExecutorService; +import java.util.concurrent.Executors; +import java.util.concurrent.Future; +import java.util.concurrent.TimeUnit; + +import org.junit.Test; + +public class AbstractBufferTest { + + + @Test(timeout = 30000) + public void runTest() throws InterruptedException, ExecutionException { + List<String> keys = new ArrayList<>(); + Collections.addAll(keys, new String[]{ + "a", "b", "c", "d", "e", "f", "g", "h", "i", "j", "k"}); + + int numGets = 1000; + int numTesters = 20; + AbstractDBBuffer b = new TestBuffer(); + + + ExecutorService ex = Executors.newFixedThreadPool(numTesters); + CompletionService<MyTestResult> completionService = + new ExecutorCompletionService<>( + ex); + for (int i = 0; i < numTesters; i++) { + completionService.submit(new Tester(keys, b, numGets)); + } + + int results = 0; + Map<String, Integer> combined = new HashMap<>(); + while (results < numTesters) { + Future<MyTestResult> futureResult = + completionService.poll(1, TimeUnit.SECONDS); + if (futureResult != null) { + results++; + assertEquals(keys.size(), futureResult.get().getMap().keySet().size()); + for (Map.Entry<String, Integer> e : futureResult.get().getMap().entrySet()) { + if (!combined.containsKey(e.getKey())) { + combined.put(e.getKey(), e.getValue()); + } else { + assertEquals(combined.get(e.getKey()), e.getValue()); + } + } + } + } + assertEquals(keys.size(), b.getNumWrites()); + } + + private class Tester implements Callable<MyTestResult> { + + private Random r = new 
Random(); + private Map<String, Integer> m = new HashMap<>(); + List<String> keys = new ArrayList<>(); + private final AbstractDBBuffer dbBuffer; + private final int numGets; + + private Tester(List<String> inputKeys, AbstractDBBuffer buffer, int numGets) { + keys.addAll(inputKeys); + dbBuffer = buffer; + this.numGets = numGets; + } + + @Override + public MyTestResult call() throws Exception { + + + for (int i = 0; i < numGets; i++) { + int index = r.nextInt(keys.size()); + String k = keys.get(index); + if (k == null) { + throw new RuntimeException("keys can't be null"); + } + Integer expected = m.get(k); + Integer val = dbBuffer.getId(k); + if (val == null) { + throw new RuntimeException("Val can't be null!"); + } + if (expected != null) { + assertEquals(expected, val); + } + m.put(k, val); + } + + //now add the val for every key + //just in case the rand() process didn't hit + //all indices + for (String k : keys) { + Integer val = dbBuffer.getId(k); + m.put(k, val); + } + MyTestResult r = new MyTestResult(m); + return r; + } + } + + private class MyTestResult { + Map<String, Integer> m; + private MyTestResult(Map<String, Integer> m) { + this.m = m; + } + private Map<String, Integer> getMap() { + return m; + } + + @Override + public String toString() { + return "MyTester: "+m.size(); + } + } + + private class TestBuffer extends AbstractDBBuffer { + @Override + public void write(int id, String value) throws RuntimeException { + try { + Thread.sleep(100); + } catch (InterruptedException e) { + throw new RuntimeException(e); + } + } + + @Override + public void close() throws SQLException { + //no-op + } + } +} http://git-wip-us.apache.org/repos/asf/tika/blob/5e49c330/tika-eval/src/test/java/org/apache/tika/eval/io/ExtractReaderTest.java ---------------------------------------------------------------------- diff --git a/tika-eval/src/test/java/org/apache/tika/eval/io/ExtractReaderTest.java b/tika-eval/src/test/java/org/apache/tika/eval/io/ExtractReaderTest.java new 
file mode 100644
index 0000000..14c0013
--- /dev/null
+++ b/tika-eval/src/test/java/org/apache/tika/eval/io/ExtractReaderTest.java
@@ -0,0 +1,85 @@
/*
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements.  See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License.  You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
package org.apache.tika.eval.io;


import static org.junit.Assert.assertEquals;

import java.io.IOException;
import java.nio.file.Path;
import java.util.List;

import org.apache.tika.TikaTest;
import org.apache.tika.metadata.Metadata;
import org.apache.tika.parser.RecursiveParserWrapper;
import org.junit.Before;
import org.junit.Test;

/**
 * Exercises {@link ExtractReader} against a json extract (with an
 * attachment) and a plain-text extract.
 */
public class ExtractReaderTest extends TikaTest {

    private Path testJsonFile;
    private Path testTxtFile;

    @Before
    public void setUp() throws Exception {
        testJsonFile = getResourceAsFile("/test-dirs/extractsA/file2_attachANotB.doc.json").toPath();
        testTxtFile = getResourceAsFile("/test-dirs/extractsB/file13_attachANotB.doc.txt").toPath();
    }

    /**
     * Loads the same json extract three ways and checks how the
     * ALTER_METADATA_LIST option shapes the resulting metadata list.
     */
    @Test
    public void testBasic() throws Exception {

        ExtractReader reader = new ExtractReader();

        //AS_IS: container doc plus its attachment, each with its own content
        List<Metadata> extracts = reader.loadExtract(testJsonFile,
                ExtractReader.ALTER_METADATA_LIST.AS_IS);
        assertEquals(2, extracts.size());
        assertEquals(1, extracts.get(0).getValues(RecursiveParserWrapper.TIKA_CONTENT).length);
        assertEquals(1, extracts.get(1).getValues(RecursiveParserWrapper.TIKA_CONTENT).length);
        assertContains("fox", extracts.get(0).get(RecursiveParserWrapper.TIKA_CONTENT));
        assertContains("attachment", extracts.get(1).get(RecursiveParserWrapper.TIKA_CONTENT));

        //FIRST_ONLY: attachment is dropped entirely
        extracts = reader.loadExtract(testJsonFile, ExtractReader.ALTER_METADATA_LIST.FIRST_ONLY);
        assertEquals(1, extracts.size());
        assertEquals(1, extracts.get(0).getValues(RecursiveParserWrapper.TIKA_CONTENT).length);
        assertContains("fox", extracts.get(0).get(RecursiveParserWrapper.TIKA_CONTENT));
        assertNotContained("attachment", extracts.get(0).get(RecursiveParserWrapper.TIKA_CONTENT));

        //CONCATENATE: single entry whose content includes the attachment's text
        extracts = reader.loadExtract(testJsonFile, ExtractReader.ALTER_METADATA_LIST.CONCATENATE_CONTENT_INTO_FIRST);
        assertEquals(1, extracts.size());
        assertEquals(1, extracts.get(0).getValues(RecursiveParserWrapper.TIKA_CONTENT).length);
        assertContains("fox", extracts.get(0).get(RecursiveParserWrapper.TIKA_CONTENT));
        assertContains("attachment", extracts.get(0).get(RecursiveParserWrapper.TIKA_CONTENT));
    }

    /** Plain-text extracts load as a single entry with msword inferred from the file name. */
    @Test
    public void testTextBasic() throws IOException {
        ExtractReader reader = new ExtractReader();
        List<Metadata> extracts = reader.loadExtract(testTxtFile,
                ExtractReader.ALTER_METADATA_LIST.AS_IS);
        assertEquals(1, extracts.size());
        Metadata m = extracts.get(0);
        assertEquals(1, m.getValues(RecursiveParserWrapper.TIKA_CONTENT).length);
        assertEquals("the quick brown fox fox fox jumped over the lazy lazy dog\n",
                m.get(RecursiveParserWrapper.TIKA_CONTENT));

        //test that the mime is inferred from the file extension
        assertEquals("application/msword", m.get(Metadata.CONTENT_TYPE));
    }



}
http://git-wip-us.apache.org/repos/asf/tika/blob/5e49c330/tika-eval/src/test/java/org/apache/tika/eval/io/FatalExceptionReaderTest.java ---------------------------------------------------------------------- diff --git a/tika-eval/src/test/java/org/apache/tika/eval/io/FatalExceptionReaderTest.java b/tika-eval/src/test/java/org/apache/tika/eval/io/FatalExceptionReaderTest.java new file mode 100644 index 0000000..3b99a76 --- /dev/null +++ b/tika-eval/src/test/java/org/apache/tika/eval/io/FatalExceptionReaderTest.java @@ -0,0 +1,32 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +package org.apache.tika.eval.io; + + +import java.io.InputStream; + +import org.junit.Test; + +public class FatalExceptionReaderTest { + @Test + public void testSimpleRead() throws Exception { + InputStream is = this.getClass().getResourceAsStream("/test-dirs/batch-logs/batch-process-fatal.xml"); + XMLLogReader reader = new XMLLogReader(); + //reader.read(is); + } +} http://git-wip-us.apache.org/repos/asf/tika/blob/5e49c330/tika-eval/src/test/java/org/apache/tika/eval/reports/ResultsReporterTest.java ---------------------------------------------------------------------- diff --git a/tika-eval/src/test/java/org/apache/tika/eval/reports/ResultsReporterTest.java b/tika-eval/src/test/java/org/apache/tika/eval/reports/ResultsReporterTest.java new file mode 100644 index 0000000..5e43303 --- /dev/null +++ b/tika-eval/src/test/java/org/apache/tika/eval/reports/ResultsReporterTest.java @@ -0,0 +1,60 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +package org.apache.tika.eval.reports; + +import java.nio.file.Files; +import java.nio.file.Path; +import java.nio.file.Paths; +import java.sql.Connection; +import java.sql.Statement; + +import org.apache.tika.eval.db.H2Util; +import org.junit.Before; +import org.junit.Ignore; +import org.junit.Test; + +public class ResultsReporterTest { + private Path configFile; + private Path tmpDir; + private String dbName = "test-db"; + private Connection connection; + + @Before + public void setUp() throws Exception { + configFile = Paths.get(this.getClass().getResource("/reports.xml").toURI()); + tmpDir = Files.createTempDirectory("tika-eval-report-test-"); + + connection = new H2Util(tmpDir.resolve(dbName)).getConnection(true); + String sql = "CREATE TABLE test_table (ID LONG PRIMARY KEY, STRING VARCHAR(32))"; + Statement st = connection.createStatement(); + st.execute(sql); + sql = "INSERT into test_table values ( 100000, 'the quick brown')"; + st.execute(sql); + sql = "INSERT into test_table values (123456789, 'fox jumped over')"; + st.execute(sql); + connection.commit(); + } + + @Test + @Ignore("add a real test here") + public void testBuilder() throws Exception { + ResultsReporter r = ResultsReporter.build(configFile); + r.execute(connection, Paths.get("reports")); + System.out.println("finished: "+ tmpDir.toString()); + } +} http://git-wip-us.apache.org/repos/asf/tika/blob/5e49c330/tika-eval/src/test/java/org/apache/tika/eval/tokens/LuceneTokenCounter.java ---------------------------------------------------------------------- diff --git a/tika-eval/src/test/java/org/apache/tika/eval/tokens/LuceneTokenCounter.java b/tika-eval/src/test/java/org/apache/tika/eval/tokens/LuceneTokenCounter.java new file mode 100644 index 0000000..486cac7 --- /dev/null +++ b/tika-eval/src/test/java/org/apache/tika/eval/tokens/LuceneTokenCounter.java @@ -0,0 +1,191 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. 
See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.tika.eval.tokens; + +import java.io.IOException; +import java.util.HashMap; +import java.util.Map; + +import org.apache.commons.math3.stat.descriptive.SummaryStatistics; +import org.apache.commons.math3.util.FastMath; +import org.apache.lucene.analysis.Analyzer; +import org.apache.lucene.index.LeafReader; +import org.apache.lucene.index.Terms; +import org.apache.lucene.index.TermsEnum; +import org.apache.lucene.index.memory.MemoryIndex; +import org.apache.lucene.search.IndexSearcher; +import org.apache.lucene.util.BytesRef; + +/** + * Experimental class uses Lucene's MemoryIndex to effectively build the + * token info. 
+ */ +public class LuceneTokenCounter { + private static final String ALPHA_IDEOGRAPH_SUFFIX = "_a"; + + private final LeafReader leafReader; + private final MemoryIndex memoryIndex; + private final Analyzer generalAnalyzer; + private final Analyzer alphaIdeographAnalyzer; + private int topN = 10; + + Map<String, TokenStatistics> fieldStats = new HashMap<>(); + + public LuceneTokenCounter(Analyzer generalAnalyzer, Analyzer alphaIdeographAnalyzer) throws IOException { + memoryIndex = new MemoryIndex(); + IndexSearcher searcher = memoryIndex.createSearcher(); + leafReader = (LeafReader)searcher.getIndexReader(); + this.generalAnalyzer = generalAnalyzer; + this.alphaIdeographAnalyzer = alphaIdeographAnalyzer; + } + + public void add(String field, String content) throws IOException { + memoryIndex.addField(field, content, generalAnalyzer); + //memoryIndex.addField(field+ALPHA_IDEOGRAPH_SUFFIX, + // content, alphaIdeographAnalyzer); + count(field); + //count(field+ALPHA_IDEOGRAPH_SUFFIX); + + } + + + void count(String field) throws IOException { + long tokenCount = leafReader.getSumTotalTermFreq(field); + if (tokenCount > Integer.MAX_VALUE) { + throw new IllegalArgumentException("can't handle longs"); + } + int tokenCountInt = (int)tokenCount; + int uniqueTokenCount = 0; + SummaryStatistics summStats = new SummaryStatistics(); + double ent = 0.0d; + double p = 0.0d; + double base = 2.0; + + Terms terms = leafReader.terms(field); + if (terms == null) { + //if there were no terms + fieldStats.put(field, new TokenStatistics(uniqueTokenCount, tokenCountInt, + new TokenIntPair[0], ent, summStats)); + return; + + } + TermsEnum termsEnum = terms.iterator(); + BytesRef bytesRef = termsEnum.next(); + TokenCountPriorityQueue queue= new TokenCountPriorityQueue(topN); + + while (bytesRef != null) { + + long termFreq = termsEnum.totalTermFreq(); + if (termFreq > Integer.MAX_VALUE) { + throw new IllegalArgumentException("Sorry can't handle longs yet"); + } + int tf = (int)termFreq; + 
//TODO: figure out how to avoid Stringifying this + //to get codepoint count + String t = bytesRef.utf8ToString(); + int len = t.codePointCount(0, t.length()); + for (int i = 0; i < tf; i++) { + summStats.addValue(len); + } + p = (double) tf / (double) tokenCount; + ent += p * FastMath.log(base, p); + + if (queue.top() == null || queue.size() < topN || + tf >= queue.top().getValue()) { + queue.insertWithOverflow(new TokenIntPair(t, tf)); + } + + uniqueTokenCount++; + bytesRef = termsEnum.next(); + } + if (tokenCountInt > 0) { + ent = (-1.0d / (double)tokenCountInt) * ent; + } + + fieldStats.put(field, new TokenStatistics(uniqueTokenCount, tokenCountInt, + queue.getArray(), ent, summStats)); + } + + public void setTopN(int topN) { + this.topN = topN; + } + + public TokenStatistics getTokenStatistics(String field) { + return fieldStats.get(field); + } + public Terms getAlphaTerms(String field) throws IOException { + return leafReader.terms(field+ALPHA_IDEOGRAPH_SUFFIX); + } + public Terms getTerms(String field) throws IOException { + return leafReader.terms(field); + } + + + public void clear() { + memoryIndex.reset(); + fieldStats.clear(); + } +/* + public ContrastStatistics contrast(String fieldA, String fieldB) throws IOException { + long diceDenom = getUniqueTokenCount(fieldA) + + getUniqueTokenCount(fieldB); + + long diceNum = 0; + long overlapNum = 0; + + Terms termsA = getTerms(fieldA); + Terms termsB = getTerms(fieldB); + + TermsEnum termsEnumA = termsA.iterator(); + TermsEnum termsEnumB = termsB.iterator(); + + BytesRef bytesRefA = termsEnumA.next(); + BytesRef bytesRefB = termsEnumB.next(); + + while (bytesRefA != null) { + int compare = bytesRefA.compareTo(bytesRefB); + while (compare > 0) { + if (bytesRefB == null) { + break; + } + //handle term in B, but not A + + compare = bytesRefA.compareTo(bytesRefB); + bytesRefB = termsEnumB.next(); + } + if (compare == 0) { + diceNum += 2; + overlapNum += 2 * Math.min(termsEnumA.totalTermFreq(), 
termsEnumB.totalTermFreq()); + } + + bytesRefA = termsEnumA.next(); + } + + + for (PairCount p : tokens.values()) { + if (p.a > 0 && p.b > 0) { + diceNum += 2; + overlapNum += 2 * Math.min(p.a, p.b); + } + } + + float dice = (float) diceNum / (float) diceDenom; + float overlap = (float) overlapNum / (float) (theseTokens.getTokenCount() + thoseTokens.getTokenCount()); + } +*/ +} http://git-wip-us.apache.org/repos/asf/tika/blob/5e49c330/tika-eval/src/test/java/org/apache/tika/eval/tokens/TokenCounterTest.java ---------------------------------------------------------------------- diff --git a/tika-eval/src/test/java/org/apache/tika/eval/tokens/TokenCounterTest.java b/tika-eval/src/test/java/org/apache/tika/eval/tokens/TokenCounterTest.java new file mode 100644 index 0000000..719b56c --- /dev/null +++ b/tika-eval/src/test/java/org/apache/tika/eval/tokens/TokenCounterTest.java @@ -0,0 +1,131 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +package org.apache.tika.eval.tokens; + + +import static org.junit.Assert.assertEquals; + +import java.io.IOException; +import java.util.Date; +import java.util.HashMap; +import java.util.Map; +import java.util.Random; + +import org.apache.commons.lang3.mutable.MutableInt; +import org.apache.lucene.analysis.Analyzer; +import org.apache.lucene.analysis.TokenStream; +import org.apache.lucene.analysis.tokenattributes.CharTermAttribute; +import org.junit.BeforeClass; +import org.junit.Test; + +public class TokenCounterTest { + private final static String FIELD = "f"; + private static AnalyzerManager analyzerManager; + + private final int topN = 10; + + @BeforeClass + public static void setUp() throws IOException { + analyzerManager = AnalyzerManager.newInstance(); + + } + + @Test + public void testBasic() throws Exception { + String s = " bde cde def abc efg f f f f ghijklmnop a a a a a a a a a a a a a a a a a b b b b b b b b b b b b b"; + TokenCounter counter = new TokenCounter(analyzerManager.getGeneralAnalyzer(), + analyzerManager.getAlphaIdeoAnalyzer()); + counter.add(FIELD, s); + TokenStatistics simpleTokenStatistics = counter.getTokenStatistics(FIELD); + LuceneTokenCounter tokenCounter = new LuceneTokenCounter(analyzerManager.getGeneralAnalyzer(), + analyzerManager.getAlphaIdeoAnalyzer()); + tokenCounter.add(FIELD, s); + assertEquals(simpleTokenStatistics, tokenCounter.getTokenStatistics(FIELD)); + } + + @Test + public void testRandom() throws Exception { + + long simple = 0; + long lucene = 0; + int numberOfTests = 100; + for (int i = 0; i < numberOfTests; i++) { + String s = generateString(); + long start = new Date().getTime(); + TokenCounter counter = new TokenCounter(analyzerManager.getGeneralAnalyzer(), + analyzerManager.getAlphaIdeoAnalyzer()); + counter.add(FIELD, s); + simple += new Date().getTime()-start; + TokenStatistics simpleTokenStatistics = counter.getTokenStatistics(FIELD); + + start = new Date().getTime(); + LuceneTokenCounter 
tokenCounter = new LuceneTokenCounter(analyzerManager.getGeneralAnalyzer(), + analyzerManager.getAlphaIdeoAnalyzer()); + tokenCounter.add(FIELD, s); + lucene += new Date().getTime()-start; + assertEquals(s, simpleTokenStatistics, tokenCounter.getTokenStatistics(FIELD)); + } + + //System.out.println("SIMPLE: " + simple + " lucene: "+lucene); + } + + @Test + public void testCommonTokens() throws Exception { + TokenCounter tokenCounter = new TokenCounter(analyzerManager.getGeneralAnalyzer(), + analyzerManager.getAlphaIdeoAnalyzer()); + String s = "the http://www.cnn.com and blahdeb...@apache.org are in valuable www.sites.org æ®ææ¯é¡¿å¤§å¦"; + tokenCounter.add(FIELD, s); + Map<String, MutableInt> tokens = tokenCounter.getAlphaTokens(FIELD); + assertEquals(new MutableInt(2), tokens.get("___url___")); + assertEquals(new MutableInt(1), tokens.get("___email___")); + } + + @Test + public void testCJKFilter() throws Exception { + String s = "then quickbrownfoxjumpedoverthelazy dogss dog æ®ææ¯é¡¿å¤§å¦"; + Analyzer analyzer = analyzerManager.getCommonTokensAnalyzer(); + TokenStream ts = analyzer.tokenStream(FIELD, s); + CharTermAttribute termAtt = ts.getAttribute(CharTermAttribute.class); + ts.reset(); + Map<String, Integer> tokens = new HashMap<>(); + while (ts.incrementToken()) { + String t = termAtt.toString(); + Integer count = tokens.get(t); + count = (count == null) ? 
count = 0 : count; + count++; + tokens.put(t, count); + } + ts.end(); + ts.close(); + assertEquals(7, tokens.size()); + assertEquals(new Integer(1), tokens.get("ææ¯")); + } + + private String generateString() { + + Random r = new Random(); + int len = r.nextInt(1000); + int uniqueVocabTerms = 10000; + StringBuilder sb = new StringBuilder(); + for (int i = 0; i < len; i++) { + sb.append(Integer.toString(r.nextInt(uniqueVocabTerms)+100000)); + sb.append(" "); + } + return sb.toString(); + } +} http://git-wip-us.apache.org/repos/asf/tika/blob/5e49c330/tika-eval/src/test/java/org/apache/tika/eval/util/MimeUtilTest.java ---------------------------------------------------------------------- diff --git a/tika-eval/src/test/java/org/apache/tika/eval/util/MimeUtilTest.java b/tika-eval/src/test/java/org/apache/tika/eval/util/MimeUtilTest.java new file mode 100644 index 0000000..40b7484 --- /dev/null +++ b/tika-eval/src/test/java/org/apache/tika/eval/util/MimeUtilTest.java @@ -0,0 +1,65 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +package org.apache.tika.eval.util; + + +import static junit.framework.TestCase.assertTrue; +import static org.junit.Assert.assertEquals; + +import org.apache.tika.config.TikaConfig; +import org.apache.tika.mime.MimeType; +import org.apache.tika.mime.MimeTypeException; +import org.apache.tika.mime.MimeTypes; +import org.junit.Ignore; +import org.junit.Test; + +@Ignore("Fix mimetype.getExtension to work with these and then we can get rid of MimeUtil") +public class MimeUtilTest { + + private final TikaConfig config = TikaConfig.getDefaultConfig(); + + @Test + public void testBasic() throws Exception { + assertResult("application/pdf", ".pdf"); + assertResult("APPLICATION/PDF", ".pdf"); + assertResult("text/plain; charset=ISO-8859-1", ".txt"); + assertResult("application/xhtml+xml; charset=UTF-8\n", ".html"); + assertResult("application/xml; charset=UTF-8\n", ".xml"); + + assertException("bogosity", "xml"); + } + + private void assertException(String contentType, String expected) { + boolean ex = false; + try { + assertResult(contentType, expected); + } catch (MimeTypeException e) { + ex = true; + } + assertTrue("Should have had exception for: " + contentType, ex); + } + + private void assertResult(String contentType, String expected) throws MimeTypeException { + TikaConfig tikaConfig = TikaConfig.getDefaultConfig(); + MimeTypes r = tikaConfig.getMimeRepository(); + MimeType mt = r.forName(contentType); + +// String ext = MimeUtil.getExtension(contentType, config); + assertEquals(expected, mt.getExtension()); + } +} http://git-wip-us.apache.org/repos/asf/tika/blob/5e49c330/tika-eval/src/test/resources/commontokens/en ---------------------------------------------------------------------- diff --git a/tika-eval/src/test/resources/commontokens/en b/tika-eval/src/test/resources/commontokens/en new file mode 100644 index 0000000..8d442fe --- /dev/null +++ b/tika-eval/src/test/resources/commontokens/en @@ -0,0 +1,8 @@ +the +of +and +a +or +#quick +brown +fox \ No 
newline at end of file http://git-wip-us.apache.org/repos/asf/tika/blob/5e49c330/tika-eval/src/test/resources/commontokens/es ---------------------------------------------------------------------- diff --git a/tika-eval/src/test/resources/commontokens/es b/tika-eval/src/test/resources/commontokens/es new file mode 100644 index 0000000..b9bfd03 --- /dev/null +++ b/tika-eval/src/test/resources/commontokens/es @@ -0,0 +1,10 @@ +la +de +y +una + + +o +rápido +marrón +zorro \ No newline at end of file http://git-wip-us.apache.org/repos/asf/tika/blob/5e49c330/tika-eval/src/test/resources/commontokens/zh-cn ---------------------------------------------------------------------- diff --git a/tika-eval/src/test/resources/commontokens/zh-cn b/tika-eval/src/test/resources/commontokens/zh-cn new file mode 100644 index 0000000..bec617d --- /dev/null +++ b/tika-eval/src/test/resources/commontokens/zh-cn @@ -0,0 +1,8 @@ +ç +ç +å +ä¸å +è¦ä¹ +å¿« +æ£è² +çç¸ \ No newline at end of file http://git-wip-us.apache.org/repos/asf/tika/blob/5e49c330/tika-eval/src/test/resources/commontokens/zh-tw ---------------------------------------------------------------------- diff --git a/tika-eval/src/test/resources/commontokens/zh-tw b/tika-eval/src/test/resources/commontokens/zh-tw new file mode 100644 index 0000000..bc91291 --- /dev/null +++ b/tika-eval/src/test/resources/commontokens/zh-tw @@ -0,0 +1,8 @@ +ç +ç +å +ä¸ä¸ª +è¦ä¹ +å¿« +æ£è² +çç¸ \ No newline at end of file http://git-wip-us.apache.org/repos/asf/tika/blob/5e49c330/tika-eval/src/test/resources/log4j.properties ---------------------------------------------------------------------- diff --git a/tika-eval/src/test/resources/log4j.properties b/tika-eval/src/test/resources/log4j.properties new file mode 100644 index 0000000..925f9f2 --- /dev/null +++ b/tika-eval/src/test/resources/log4j.properties @@ -0,0 +1,11 @@ + +log4j.rootLogger=WARN,A1 + +#for debugging +#log4j.rootLogger=TRACE,A1 + 
+log4j.appender.A1=org.apache.log4j.ConsoleAppender + +# A1 uses PatternLayout. +log4j.appender.A1.layout=org.apache.log4j.PatternLayout +log4j.appender.A1.layout.ConversionPattern=%-4r [%t] %-5p %c %x - %m%n http://git-wip-us.apache.org/repos/asf/tika/blob/5e49c330/tika-eval/src/test/resources/log4j_process.properties ---------------------------------------------------------------------- diff --git a/tika-eval/src/test/resources/log4j_process.properties b/tika-eval/src/test/resources/log4j_process.properties new file mode 100644 index 0000000..cca8871 --- /dev/null +++ b/tika-eval/src/test/resources/log4j_process.properties @@ -0,0 +1,11 @@ + +log4j.rootLogger=TRACE,A1 + +#for debugging +#log4j.rootLogger=TRACE,A1 + +log4j.appender.A1=org.apache.log4j.ConsoleAppender + +# A1 uses PatternLayout. +log4j.appender.A1.layout=org.apache.log4j.PatternLayout +log4j.appender.A1.layout.ConversionPattern=%-4r [%t] %-5p %c %x - %m%n http://git-wip-us.apache.org/repos/asf/tika/blob/5e49c330/tika-eval/src/test/resources/single-file-profiler-crawl-extract-config.xml ---------------------------------------------------------------------- diff --git a/tika-eval/src/test/resources/single-file-profiler-crawl-extract-config.xml b/tika-eval/src/test/resources/single-file-profiler-crawl-extract-config.xml new file mode 100644 index 0000000..9993ff1 --- /dev/null +++ b/tika-eval/src/test/resources/single-file-profiler-crawl-extract-config.xml @@ -0,0 +1,72 @@ +<?xml version="1.0" encoding="UTF-8" standalone="no" ?> +<!-- + Licensed to the Apache Software Foundation (ASF) under one + or more contributor license agreements. See the NOTICE file + distributed with this work for additional information + regarding copyright ownership. The ASF licenses this file + to you under the Apache License, Version 2.0 (the + "License"); you may not use this file except in compliance + with the License. 
You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, + software distributed under the License is distributed on an + "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + KIND, either express or implied. See the License for the + specific language governing permissions and limitations + under the License. +--> + +<tika-batch-config + maxAliveTimeSeconds="-1" + pauseOnEarlyTerminationMillis="500" + timeoutCheckPulseMillis="1000" + maxQueueSize="10000" + numConsumers="5" + timeoutThresholdMillis="300000"> + + <commandline> + <option opt="c" longOpt="tika-config" hasArg="true" + description="TikaConfig file"/> + + <option opt="bc" longOpt="batch-config" hasArg="true" + description="xml batch config file" required="true"/> + <option opt="inputDir" hasArg="true" + description="dir to start crawling"/> + <option opt="numConsumers" hasArg="true" + description="number of fileConsumers threads"/> + <option opt="extractDir" hasArg="true" + description="this dir contains the files containing extracted metadata/content" required="false"/> + <option opt="db" hasArg="true" + description="name of db directory or file to which to write results"/> + </commandline> + + + <!-- + Can also add startDir: this tells the crawler to start indexing a + child directory of the inputDir directory. 
+ --> + <crawler builderClass="org.apache.tika.batch.fs.builders.FSCrawlerBuilder" + inputDir="src/test/resources/test-dirs/extractsA" + crawlOrder="sorted" + maxConsecWaitMillis="5000" + maxFilesToAdd="-1" + maxFilesToConsider="-1" + includeFilePat="" + excludeFilePat="" + maxFileSizeBytes="-1" + /> + + <consumers builderClass="org.apache.tika.eval.batch.EvalConsumersBuilder" + consumerBuilderClass="org.apache.tika.eval.batch.SingleFileConsumerBuilder" + errorLogFile="src/test/resources/test-dirs/batch-logs/batch-process-fatal.xml" + extractDir="src/test/resources/test-dirs/extractsA" + commonTokens="src/test/resources/common_tokens_short.txt"/> + + + <!-- reporter and interrupter are optional --> + <reporter builderClass="org.apache.tika.batch.builders.SimpleLogReporterBuilder" sleepMillis="1000" + staleThresholdMillis="500000"/> + <interrupter builderClass="org.apache.tika.batch.builders.InterrupterBuilder"/> +</tika-batch-config> \ No newline at end of file http://git-wip-us.apache.org/repos/asf/tika/blob/5e49c330/tika-eval/src/test/resources/single-file-profiler-crawl-input-config.xml ---------------------------------------------------------------------- diff --git a/tika-eval/src/test/resources/single-file-profiler-crawl-input-config.xml b/tika-eval/src/test/resources/single-file-profiler-crawl-input-config.xml new file mode 100644 index 0000000..da59d03 --- /dev/null +++ b/tika-eval/src/test/resources/single-file-profiler-crawl-input-config.xml @@ -0,0 +1,73 @@ +<?xml version="1.0" encoding="UTF-8" standalone="no" ?> +<!-- + Licensed to the Apache Software Foundation (ASF) under one + or more contributor license agreements. See the NOTICE file + distributed with this work for additional information + regarding copyright ownership. The ASF licenses this file + to you under the Apache License, Version 2.0 (the + "License"); you may not use this file except in compliance + with the License. 
You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, + software distributed under the License is distributed on an + "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + KIND, either express or implied. See the License for the + specific language governing permissions and limitations + under the License. +--> + +<tika-batch-config + maxAliveTimeSeconds="-1" + pauseOnEarlyTerminationMillis="500" + timeoutCheckPulseMillis="1000" + maxQueueSize="10000" + numConsumers="5" + timeoutThresholdMillis="300000"> + + <commandline> + <option opt="c" longOpt="tika-config" hasArg="true" + description="TikaConfig file"/> + + <option opt="bc" longOpt="batch-config" hasArg="true" + description="xml batch config file" required="true"/> + <option opt="inputDir" hasArg="true" + description="dir to start crawling"/> + <option opt="numConsumers" hasArg="true" + description="number of fileConsumers threads"/> + <option opt="extractDir" hasArg="true" + description="this dir contains the files containing extracted metadata/content" required="false"/> + <option opt="db" hasArg="true" + description="name of db directory or file to which to write results"/> + </commandline> + + + <!-- + Can also add startDir: this tells the crawler to start indexing a + child directory of the inputDir directory. 
+ --> + <crawler builderClass="org.apache.tika.batch.fs.builders.FSCrawlerBuilder" + inputDir="src/test/resources/test-dirs/raw_input" + crawlOrder="sorted" + maxConsecWaitMillis="5000" + maxFilesToAdd="-1" + maxFilesToConsider="-1" + includeFilePat="" + excludeFilePat="" + maxFileSizeBytes="-1" + /> + + <consumers builderClass="org.apache.tika.eval.batch.EvalConsumersBuilder" + consumerBuilderClass="org.apache.tika.eval.batch.SingleFileConsumerBuilder" + errorLogFile="src/test/resources/test-dirs/batch-logs/batch-process-fatal.xml" + extractDir="src/test/resources/test-dirs/extractsA" + inputDir="src/test/resources/test-dirs/raw_input" + commonTokens="src/test/resources/common_tokens"/> + + + <!-- reporter and interrupter are optional --> + <reporter builderClass="org.apache.tika.batch.builders.SimpleLogReporterBuilder" sleepMillis="1000" + staleThresholdMillis="500000"/> + <interrupter builderClass="org.apache.tika.batch.builders.InterrupterBuilder"/> +</tika-batch-config> \ No newline at end of file http://git-wip-us.apache.org/repos/asf/tika/blob/5e49c330/tika-eval/src/test/resources/test-dirs/batch-logs/batch-process-fatal.xml ---------------------------------------------------------------------- diff --git a/tika-eval/src/test/resources/test-dirs/batch-logs/batch-process-fatal.xml b/tika-eval/src/test/resources/test-dirs/batch-logs/batch-process-fatal.xml new file mode 100644 index 0000000..520306b --- /dev/null +++ b/tika-eval/src/test/resources/test-dirs/batch-logs/batch-process-fatal.xml @@ -0,0 +1,59 @@ +<!-- + Licensed to the Apache Software Foundation (ASF) under one + or more contributor license agreements. See the NOTICE file + distributed with this work for additional information + regarding copyright ownership. The ASF licenses this file + to you under the Apache License, Version 2.0 (the + "License"); you may not use this file except in compliance + with the License. 
You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, + software distributed under the License is distributed on an + "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + KIND, either express or implied. See the License for the + specific language governing permissions and limitations + under the License. +--> + +<log4j:event logger="org.apache.tika.batch.FileResourceConsumer" timestamp="1436376775762" level="ERROR" thread="pool-2-thread-11"> +<log4j:message><![CDATA[<?xml version="1.0" ?><timed_out resourceId="file10_permahang.txt" elapsedMS="340302"></timed_out>]]></log4j:message> +</log4j:event> + +<log4j:event logger="org.apache.tika.batch.FileResourceConsumer" timestamp="1436376775758" level="ERROR" thread="pool-2-thread-10"> +<log4j:message><![CDATA[<?xml version="1.0" ?><oom resourceId="file11_oom.txt">java.lang.OutOfMemoryError: Java heap space + at java.io.ByteArrayOutputStream.<init>(ByteArrayOutputStream.java:77) + at org.apache.fontbox.ttf.MemoryTTFDataStream.<init>(MemoryTTFDataStream.java:45) + at org.apache.fontbox.ttf.TTFParser.parse(TTFParser.java:96) + at org.apache.pdfbox.pdmodel.font.PDTrueTypeFont.<init>(PDTrueTypeFont.java:135) + at org.apache.pdfbox.pdmodel.font.PDFontFactory.createFont(PDFontFactory.java:75) + at org.apache.pdfbox.pdmodel.PDResources.getFont(PDResources.java:96) + at org.apache.pdfbox.contentstream.operator.text.SetFontAndSize.process(SetFontAndSize.java:50) + at org.apache.pdfbox.contentstream.PDFStreamEngine.processOperator(PDFStreamEngine.java:795) + at org.apache.pdfbox.contentstream.PDFStreamEngine.processStreamOperators(PDFStreamEngine.java:462) + at org.apache.pdfbox.contentstream.PDFStreamEngine.processStream(PDFStreamEngine.java:438) + at org.apache.pdfbox.contentstream.PDFStreamEngine.showForm(PDFStreamEngine.java:178) + at org.apache.pdfbox.contentstream.operator.DrawObject.process(DrawObject.java:49) + at 
org.apache.pdfbox.contentstream.PDFStreamEngine.processOperator(PDFStreamEngine.java:795) + at org.apache.pdfbox.contentstream.PDFStreamEngine.processStreamOperators(PDFStreamEngine.java:462) + at org.apache.pdfbox.contentstream.PDFStreamEngine.processStream(PDFStreamEngine.java:438) + at org.apache.pdfbox.contentstream.PDFStreamEngine.processPage(PDFStreamEngine.java:149) + at org.apache.pdfbox.text.PDFTextStreamEngine.processPage(PDFTextStreamEngine.java:117) + at org.apache.pdfbox.text.PDFTextStripper.processPage(PDFTextStripper.java:369) + at org.apache.pdfbox.text.PDFTextStripper.processPages(PDFTextStripper.java:305) + at org.apache.pdfbox.text.PDFTextStripper.writeText(PDFTextStripper.java:249) + at org.apache.tika.parser.pdf.PDF2XHTML.process(PDF2XHTML.java:137) + at org.apache.tika.parser.pdf.PDFParser.parse(PDFParser.java:132) + at org.apache.tika.parser.CompositeParser.parse(CompositeParser.java:281) + at org.apache.tika.parser.CompositeParser.parse(CompositeParser.java:281) + at org.apache.tika.parser.AutoDetectParser.parse(AutoDetectParser.java:120) + at org.apache.tika.parser.ParserDecorator.parse(ParserDecorator.java:177) + at org.apache.tika.parser.DigestingParser.parse(DigestingParser.java:74) + at org.apache.tika.parser.RecursiveParserWrapper.parse(RecursiveParserWrapper.java:158) + at org.apache.tika.batch.FileResourceConsumer.parse(FileResourceConsumer.java:410) + at org.apache.tika.batch.fs.RecursiveParserWrapperFSConsumer.processFileResource(RecursiveParserWrapperFSConsumer.java:104) + at org.apache.tika.batch.FileResourceConsumer._processFileResource(FileResourceConsumer.java:182) + at org.apache.tika.batch.FileResourceConsumer.call(FileResourceConsumer.java:115) +</oom>]]></log4j:message> +</log4j:event> \ No newline at end of file