http://git-wip-us.apache.org/repos/asf/tika/blob/aa7a0c35/tika-eval/src/test/java/org/apache/tika/eval/ComparerBatchTest.java
----------------------------------------------------------------------
diff --git 
a/tika-eval/src/test/java/org/apache/tika/eval/ComparerBatchTest.java 
b/tika-eval/src/test/java/org/apache/tika/eval/ComparerBatchTest.java
new file mode 100644
index 0000000..0d925cf
--- /dev/null
+++ b/tika-eval/src/test/java/org/apache/tika/eval/ComparerBatchTest.java
@@ -0,0 +1,411 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.tika.eval;
+
+import static org.junit.Assert.assertEquals;
+import static org.junit.Assert.assertTrue;
+
+import java.io.File;
+import java.nio.file.FileSystems;
+import java.nio.file.Files;
+import java.nio.file.Path;
+import java.sql.Connection;
+import java.sql.ResultSet;
+import java.sql.ResultSetMetaData;
+import java.sql.Statement;
+import java.util.ArrayList;
+import java.util.HashMap;
+import java.util.HashSet;
+import java.util.List;
+import java.util.Map;
+import java.util.Set;
+
+import org.apache.commons.io.FileUtils;
+import org.apache.tika.batch.fs.FSBatchTestBase;
+import org.apache.tika.eval.db.Cols;
+import org.junit.AfterClass;
+import org.junit.BeforeClass;
+import org.junit.Ignore;
+import org.junit.Test;
+
+@Ignore("need to fix tika-batch tests to make this work")
+public class ComparerBatchTest extends FSBatchTestBase {
+
+    public final static String COMPARER_PROCESS_CLASS = 
"org.apache.tika.batch.fs.FSBatchProcessCLI";
+
+    private static Path dbDir;
+    private static Connection conn;
+
+    private final static String compJoinCont = "";
+    /*ExtractComparer.COMPARISONS_TABLE+" cmp " +
+            "join "+ExtractComparer.CONTAINERS_TABLE + " cnt "+
+            "on cmp."+AbstractProfiler.CONTAINER_HEADERS.CONTAINER_ID+
+            " = cnt."+AbstractProfiler.CONTAINER_HEADERS.CONTAINER_ID;*/
+
+    @BeforeClass
+    public static void setUp() throws Exception {
+
+        File inputRoot = new 
File(ComparerBatchTest.class.getResource("/test-dirs").toURI());
+        dbDir = Files.createTempDirectory(inputRoot.toPath(), 
"tika-test-db-dir-");
+        Map<String, String> args = new HashMap<>();
+        Path db = FileSystems.getDefault().getPath(dbDir.toString(), 
"comparisons_test");
+        args.put("-db", db.toString());
+
+        //for debugging, you can use this to select only one file pair to load
+        //args.put("-includeFilePat", "file8.*");
+/*
+        BatchProcessTestExecutor ex = new 
BatchProcessTestExecutor(COMPARER_PROCESS_CLASS, args,
+                "/tika-batch-comparison-eval-config.xml");
+        StreamStrings streamStrings = ex.execute();
+        System.out.println(streamStrings.getErrString());
+        System.out.println(streamStrings.getOutString());
+        H2Util dbUtil = new H2Util(db);
+        conn = dbUtil.getConnection();*/
+    }
+
+    @AfterClass
+    public static void tearDown() throws Exception {
+
+        conn.close();
+
+        FileUtils.deleteDirectory(dbDir.toFile());
+    }
+
+
+    @Test
+    public void testSimpleDBWriteAndRead() throws Exception {
+        Set<String> set = new HashSet<>();
+        //filenames
+        List<String> list = getColStrings(Cols.FILE_NAME.name(),
+                ExtractComparer.PROFILES_A.getName(), "");
+        assertEquals(7, list.size());
+        assertTrue(list.contains("file1.pdf"));
+
+        //container ids in comparisons table
+        list = getColStrings(Cols.CONTAINER_ID.name(),
+                ExtractComparer.COMPARISON_CONTAINERS.getName(),"");
+        assertEquals(10, list.size());
+        set.clear(); set.addAll(list);
+        assertEquals(10, set.size());
+/*
+        //ids in comparisons table
+        list = getColStrings(AbstractProfiler.HEADERS.ID.name(),
+                compTable,"");
+        assertEquals(9, list.size());
+        set.clear(); set.addAll(list);
+        assertEquals(9, set.size());*/
+    }
+
+
+
+    /*
+        @Test
+        public void testFile1PDFRow() throws Exception {
+            String where = fp+"='file1.pdf'";
+            Map<String, String> data = getRow(compJoinCont, where);
+            String result = 
data.get(ExtractComparer.COMPARISON_HEADERS.TOP_10_UNIQUE_TOKEN_DIFFS + "_A");
+            assertTrue(result.startsWith("over: 1"));
+
+            result = 
data.get(ExtractComparer.COMPARISON_HEADERS.TOP_10_UNIQUE_TOKEN_DIFFS + "_B");
+            assertTrue(result.startsWith("aardvark: 3 | bear: 2"));
+
+
+            assertEquals("aardvark: 3 | bear: 2",
+                    
data.get(ExtractComparer.COMPARISON_HEADERS.TOP_10_MORE_IN_B.toString()));
+            assertEquals("fox: 2 | lazy: 1 | over: 1",
+                    
data.get(ExtractComparer.COMPARISON_HEADERS.TOP_10_MORE_IN_A.toString()));
+            assertEquals("12", 
data.get(ExtractComparer.HEADERS.NUM_TOKENS+"_A"));
+            assertEquals("13", 
data.get(ExtractComparer.HEADERS.NUM_TOKENS+"_B"));
+            assertEquals("8", 
data.get(ExtractComparer.HEADERS.NUM_UNIQUE_TOKENS+"_A"));
+            assertEquals("9", 
data.get(ExtractComparer.HEADERS.NUM_UNIQUE_TOKENS+"_B"));
+
+            assertEquals(ExtractComparer.COMPARISON_HEADERS.OVERLAP.name(),
+                    0.64f, Float.parseFloat(data.get("OVERLAP")), 0.0001f);
+
+            
assertEquals(ExtractComparer.COMPARISON_HEADERS.DICE_COEFFICIENT.name(),
+                    0.8235294f, 
Float.parseFloat(data.get("DICE_COEFFICIENT")), 0.0001f);
+
+            assertEquals(ExtractComparer.HEADERS.TOKEN_LENGTH_MEAN+"_A", 
3.83333d,
+                    Double.parseDouble(
+                            
data.get(ExtractComparer.HEADERS.TOKEN_LENGTH_MEAN+"_A")), 0.0001d);
+
+            assertEquals(ExtractComparer.HEADERS.TOKEN_LENGTH_MEAN+"_B", 
4.923d,
+                    Double.parseDouble(
+                            
data.get(ExtractComparer.HEADERS.TOKEN_LENGTH_MEAN+"_B")), 0.0001d);
+
+            assertEquals(ExtractComparer.HEADERS.TOKEN_LENGTH_STD_DEV+"_A", 
1.0298d,
+                    Double.parseDouble(
+                            
data.get(ExtractComparer.HEADERS.TOKEN_LENGTH_STD_DEV+"_A")), 0.0001d);
+
+            assertEquals(ExtractComparer.HEADERS.TOKEN_LENGTH_STD_DEV+"_B", 
1.9774d,
+                    
Double.parseDouble(data.get(ExtractComparer.HEADERS.TOKEN_LENGTH_STD_DEV+"_B")),
 0.0001d);
+
+            assertEquals(ExtractComparer.HEADERS.TOKEN_LENGTH_SUM+"_A", 46,
+                    Integer.parseInt(
+                            
data.get(ExtractComparer.HEADERS.TOKEN_LENGTH_SUM+"_A")));
+
+            assertEquals(ExtractComparer.HEADERS.TOKEN_LENGTH_SUM+"_B", 64,
+                    
Integer.parseInt(data.get(ExtractComparer.HEADERS.TOKEN_LENGTH_SUM+"_B")));
+
+            assertEquals("TOKEN_ENTROPY_RATE_A", 0.237949,
+                    Double.parseDouble(data.get("TOKEN_ENTROPY_RATE_A")), 
0.0001d);
+
+            assertEquals("TOKEN_ENTROPY_RATE_B", 0.232845,
+                    Double.parseDouble(data.get("TOKEN_ENTROPY_RATE_B")), 
0.0001d);
+
+        }
+
+
+        @Test
+        public void testEmpty() throws Exception {
+            String where = fp+"='file4_emptyB.pdf'";
+            Map<String, String> data = getRow(contTable, where);
+            assertNull(data.get(AbstractProfiler.CONTAINER_HEADERS.JSON_EX +
+                    ExtractComparer.aExtension));
+            assertTrue(data.get(AbstractProfiler.CONTAINER_HEADERS.JSON_EX +
+                    
ExtractComparer.bExtension).equals(AbstractProfiler.JSON_PARSE_EXCEPTION));
+
+            where = fp+"='file5_emptyA.pdf'";
+            data = getRow(contTable, where);
+            assertNull(data.get(AbstractProfiler.CONTAINER_HEADERS.JSON_EX +
+                    ExtractComparer.bExtension));
+            assertTrue(data.get(AbstractProfiler.CONTAINER_HEADERS.JSON_EX+
+                    
ExtractComparer.aExtension).equals(AbstractProfiler.JSON_PARSE_EXCEPTION));
+        }
+
+            @Test
+            public void testMissingAttachment() throws Exception {
+                String where = fp+"='file2_attachANotB.doc' and 
"+AbstractProfiler.HEADERS.EMBEDDED_FILE_PATH+
+                        "='inner.txt'";
+                Map<String, String> data = getRow(compJoinCont, where);
+                assertContains("attachment: 1", 
data.get(ExtractComparer.COMPARISON_HEADERS.TOP_10_MORE_IN_A.name()));
+                assertNotContained("fox", 
data.get(ExtractComparer.COMPARISON_HEADERS.TOP_10_MORE_IN_B.name()));
+                assertNull(data.get(ExtractComparer.HEADERS.TOP_N_TOKENS +
+                        ExtractComparer.bExtension));
+                assertNotContained("fox", 
data.get(ExtractComparer.COMPARISON_HEADERS.TOP_10_UNIQUE_TOKEN_DIFFS +
+                        ExtractComparer.bExtension));
+
+                assertEquals("3", data.get("NUM_METADATA_VALUES_A"));
+                assertNull(data.get("DIFF_NUM_ATTACHMENTS"));
+                assertNull(data.get("NUM_METADATA_VALUES_B"));
+                assertEquals("0", data.get("NUM_UNIQUE_TOKENS_B"));
+                assertNull(data.get("TOKEN_ENTROPY_RATE_B"));
+                assertNull(data.get("NUM_EN_STOPS_TOP_N_B"));
+
+                where = fp+"='file3_attachBNotA.doc' and 
"+AbstractProfiler.HEADERS.EMBEDDED_FILE_PATH+
+                        "='inner.txt'";
+                data = getRow(compJoinCont, where);
+                assertContains("attachment: 1", 
data.get(ExtractComparer.COMPARISON_HEADERS.TOP_10_MORE_IN_B.name()));
+                assertNotContained("fox", 
data.get(ExtractComparer.COMPARISON_HEADERS.TOP_10_MORE_IN_A.name()));
+                assertNull(data.get(ExtractComparer.HEADERS.TOP_N_TOKENS +
+                        ExtractComparer.aExtension));
+                assertNotContained("fox", 
data.get(ExtractComparer.COMPARISON_HEADERS.TOP_10_UNIQUE_TOKEN_DIFFS +
+                        ExtractComparer.aExtension));
+
+                assertEquals("3", data.get("NUM_METADATA_VALUES_B"));
+                assertNull(data.get("DIFF_NUM_ATTACHMENTS"));
+                assertNull(data.get("NUM_METADATA_VALUES_A"));
+                assertEquals("0", data.get("NUM_UNIQUE_TOKENS_A"));
+                assertNull(data.get("TOKEN_ENTROPY_RATE_A"));
+                assertNull(data.get("NUM_EN_STOPS_TOP_N_A"));
+
+            }
+
+            @Test
+            public void testBothBadJson() throws Exception {
+                debugDumpAll(contTable);
+                String where = fp+"='file7_badJson.pdf'";
+                Map<String, String> data = getRow(contTable, where);
+                assertEquals(AbstractProfiler.JSON_PARSE_EXCEPTION,
+                        data.get(AbstractProfiler.CONTAINER_HEADERS.JSON_EX+ 
ExtractComparer.aExtension));
+                assertEquals(AbstractProfiler.JSON_PARSE_EXCEPTION,
+                        data.get(AbstractProfiler.CONTAINER_HEADERS.JSON_EX+ 
ExtractComparer.bExtension));
+                assertEquals("file7_badJson.pdf",
+                        
data.get(AbstractProfiler.CONTAINER_HEADERS.FILE_PATH.name()));
+                assertEquals("61", data.get("JSON_FILE_LENGTH_A"));
+                assertEquals("0", data.get("JSON_FILE_LENGTH_B"));
+                assertEquals("pdf", 
data.get(AbstractProfiler.CONTAINER_HEADERS.FILE_EXTENSION.name()));
+
+            }
+
+            @Test
+            public void testAccessPermissionException() throws Exception {
+                String sql = "select "+
+                        
AbstractProfiler.EXCEPTION_HEADERS.ACCESS_PERMISSION_EXCEPTION.name() +
+                        " from " + AbstractProfiler.EXCEPTIONS_TABLE+"_A exA "+
+                        " join " + ExtractComparer.COMPARISONS_TABLE + " cmp 
on cmp.ID=exA.ID "+
+                        " join " + ExtractComparer.CONTAINERS_TABLE + " cont 
on cmp.CONTAINER_ID=cont.CONTAINER_ID "+
+                        " where "+fp+"='file6_accessEx.pdf'";
+                Statement st = conn.createStatement();
+                ResultSet rs = st.executeQuery(sql);
+                List<String> results = new ArrayList<String>();
+                while (rs.next()) {
+                    results.add(rs.getString(1));
+                }
+                assertEquals(1, results.size());
+                assertEquals("TRUE", results.get(0));
+
+                sql = "select "+
+                        
AbstractProfiler.EXCEPTION_HEADERS.ACCESS_PERMISSION_EXCEPTION.name() +
+                        " from " + AbstractProfiler.EXCEPTIONS_TABLE+"_B exB "+
+                        " join " + ExtractComparer.COMPARISONS_TABLE + " cmp 
on cmp.ID=exB.ID "+
+                        " join " + ExtractComparer.CONTAINERS_TABLE + " cont 
on cmp.CONTAINER_ID=cont.CONTAINER_ID "+
+                        " where "+fp+"='file6_accessEx.pdf'";
+                st = conn.createStatement();
+                rs = st.executeQuery(sql);
+                results = new ArrayList<String>();
+                while (rs.next()) {
+                    results.add(rs.getString(1));
+                }
+                assertEquals(1, results.size());
+                assertEquals("TRUE", results.get(0));
+
+            }
+
+            @Test
+            public void testContainerException() throws Exception {
+                String sql = "select * "+
+                        " from " + AbstractProfiler.EXCEPTIONS_TABLE+"_A exA "+
+                        " join " + ExtractComparer.COMPARISONS_TABLE + " cmp 
on cmp.ID=exA.ID "+
+                        " join " + ExtractComparer.CONTAINERS_TABLE + " cont 
on cmp.CONTAINER_ID=cont.CONTAINER_ID "+
+                        "where "+fp+"='file8_IOEx.pdf'";
+                Statement st = conn.createStatement();
+                ResultSet rs = st.executeQuery(sql);
+
+                Map<String, String> data = new HashMap<String,String>();
+                ResultSetMetaData rsM = rs.getMetaData();
+                while (rs.next()) {
+                    for (int i = 1; i <= rsM.getColumnCount(); i++)
+                    data.put(rsM.getColumnName(i), rs.getString(i));
+                }
+
+                String sortStack = 
data.get(AbstractProfiler.EXCEPTION_HEADERS.SORT_STACK_TRACE.name());
+                sortStack = sortStack.replaceAll("[\r\n]", "<N>");
+                
assertTrue(sortStack.startsWith("java.lang.RuntimeException<N>"));
+
+                String fullStack = 
data.get(AbstractProfiler.EXCEPTION_HEADERS.ORIG_STACK_TRACE.name());
+                assertTrue(
+                        fullStack.startsWith("java.lang.RuntimeException: 
java.io.IOException: Value is not an integer"));
+            }
+
+        private void debugDumpAll(String table) throws Exception {
+            Statement st = conn.createStatement();
+            String sql = "select * from "+table;
+            ResultSet rs = st.executeQuery(sql);
+            ResultSetMetaData m = rs.getMetaData();
+            for (int i = 1; i <= m.getColumnCount(); i++) {
+                System.out.print(m.getColumnName(i) + ", ");
+            }
+            System.out.println("\n");
+            while (rs.next()) {
+                for (int i = 1; i <= m.getColumnCount(); i++) {
+                    System.out.print(rs.getString(i)+", ");
+                }
+                System.out.println("\n");
+            }
+            st.close();
+
+        }
+        */
+    private void debugShowColumns(String table) throws Exception {
+        Statement st = conn.createStatement();
+        String sql = "select * from "+table;
+        ResultSet rs = st.executeQuery(sql);
+        ResultSetMetaData m = rs.getMetaData();
+        for (int i = 1; i <= m.getColumnCount(); i++) {
+            System.out.println(i+" : "+m.getColumnName(i));
+        }
+        st.close();
+    }
+
+    //return the string value for one cell
+    private String getString(String colName, String table, String where) 
throws Exception {
+        List<String> results = getColStrings(colName, table, where);
+        if (results.size() > 1) {
+            throw new RuntimeException("more than one result");
+        } else if (results.size() == 0) {
+            throw new RuntimeException("no results");
+        }
+
+        return results.get(0);
+    }
+
+
+    private Map<String, String> getRow(String table, String where) throws 
Exception {
+        String sql = getSql("*", table, where);
+        Map<String, String> results = new HashMap<String, String>();
+        Statement st = null;
+
+        try {
+            st = conn.createStatement();
+            ResultSet rs = st.executeQuery(sql);
+            ResultSetMetaData m = rs.getMetaData();
+            int rows = 0;
+            while (rs.next()) {
+                if (rows > 0) {
+                    throw new RuntimeException("returned more than one row!");
+                }
+                for (int i = 1; i <= m.getColumnCount(); i++) {
+                    results.put(m.getColumnName(i), rs.getString(i));
+                }
+                rows++;
+            }
+        } finally {
+            if (st != null) {
+                st.close();
+            }
+        }
+        return results;
+
+    }
+
+    //return the string representations of the column values for one column
+    //as a list of strings
+    private List<String> getColStrings(String colName) throws Exception {
+        return getColStrings(colName, 
ExtractComparer.CONTENT_COMPARISONS.getName(), null);
+    }
+
+    private List<String> getColStrings(String colName, String table, String 
where) throws Exception {
+        String sql = getSql(colName, table, where);
+        List<String> results = new ArrayList<>();
+        Statement st = null;
+        try {
+            st = conn.createStatement();
+            System.out.println("SQL: "+sql);
+            ResultSet rs = st.executeQuery(sql);
+            while (rs.next()) {
+                results.add(rs.getString(1));
+            }
+        } finally {
+            if (st != null) {
+                st.close();
+            }
+        }
+        return results;
+    }
+
+    private String getSql(String colName, String table, String where) {
+        StringBuilder sb = new StringBuilder();
+        sb.append("select ").append(colName).append(" from ").append(table);
+        if (where != null && ! where.equals("")) {
+            sb.append(" where ").append(where);
+        }
+        return sb.toString();
+    }
+
+}

http://git-wip-us.apache.org/repos/asf/tika/blob/aa7a0c35/tika-eval/src/test/java/org/apache/tika/eval/ProfilerBatchTest.java
----------------------------------------------------------------------
diff --git 
a/tika-eval/src/test/java/org/apache/tika/eval/ProfilerBatchTest.java 
b/tika-eval/src/test/java/org/apache/tika/eval/ProfilerBatchTest.java
new file mode 100644
index 0000000..c3f0a7e
--- /dev/null
+++ b/tika-eval/src/test/java/org/apache/tika/eval/ProfilerBatchTest.java
@@ -0,0 +1,236 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.tika.eval;
+
+import static org.junit.Assert.assertEquals;
+import static org.junit.Assert.assertTrue;
+
+import java.io.IOException;
+import java.nio.file.DirectoryStream;
+import java.nio.file.Files;
+import java.nio.file.Path;
+import java.nio.file.Paths;
+import java.sql.Connection;
+import java.sql.ResultSet;
+import java.sql.SQLException;
+import java.sql.Statement;
+import java.util.ArrayList;
+import java.util.HashMap;
+import java.util.Iterator;
+import java.util.List;
+import java.util.Map;
+
+import org.apache.tika.eval.db.Cols;
+import org.apache.tika.eval.db.H2Util;
+import org.apache.tika.eval.db.TableInfo;
+import org.junit.AfterClass;
+import org.junit.BeforeClass;
+import org.junit.Ignore;
+import org.junit.Test;
+@Ignore
+public class ProfilerBatchTest {
+
+    public final static String COMPARER_PROCESS_CLASS = 
"org.apache.tika.batch.fs.FSBatchProcessCLI";
+
+    private static Path dbDir;
+    private static Connection conn;
+
+    private final static String profileTable = 
ExtractProfiler.PROFILE_TABLE.getName();
+    private final static String exTable = 
ExtractProfiler.EXCEPTION_TABLE.getName();
+    private final static String fpCol = Cols.FILE_PATH.name();
+
+    @BeforeClass
+    public static void setUp() throws Exception {
+
+        Path inputRoot = Paths.get(new 
ComparerBatchTest().getClass().getResource("/test-dirs/extractsA").toURI());
+        dbDir = Files.createTempDirectory(inputRoot, "tika-test-db-dir-");
+        Map<String, String> args = new HashMap<>();
+        Path db = dbDir.resolve("profiler_test");
+        args.put("-db", db.toString());
+
+        //for debugging, you can use this to select only one file pair to load
+        //args.put("-includeFilePat", "file8.*");
+
+       /* BatchProcessTestExecutor ex = new 
BatchProcessTestExecutor(COMPARER_PROCESS_CLASS, args,
+                "/single-file-profiler-crawl-input-config.xml");
+        StreamStrings streamStrings = ex.execute();
+        System.out.println(streamStrings.getErrString());
+        System.out.println(streamStrings.getOutString());*/
+        H2Util dbUtil = new H2Util(db);
+        conn = dbUtil.getConnection(true);
+    }
+    @AfterClass
+    public static void tearDown() throws IOException {
+
+        try{
+            conn.close();
+        } catch (SQLException e) {
+            throw new RuntimeException(e);
+        }
+
+
+        DirectoryStream<Path> dStream = Files.newDirectoryStream(dbDir);
+        Iterator<Path> it = dStream.iterator();
+        while (it.hasNext()) {
+            Path p = it.next();
+            Files.delete(p);
+        }
+        dStream.close();
+        Files.delete(dbDir);
+    }
+
+    @Test
+    public void testSimpleDBWriteAndRead() throws Exception {
+
+        Statement st = null;
+        List<String> fNameList = new ArrayList<>();
+        try {
+            String sql = "select * from "+ 
ExtractProfiler.CONTAINER_TABLE.getName();
+            st = conn.createStatement();
+            ResultSet rs = st.executeQuery(sql);
+            while (rs.next()) {
+                String fileName = rs.getString(Cols.FILE_PATH.name());
+                fNameList.add(fileName);
+            }
+        } finally {
+            if (st != null) {
+                st.close();
+            }
+        }
+        debugTable(ExtractProfiler.CONTAINER_TABLE);
+        debugTable(ExtractProfiler.PROFILE_TABLE);
+        debugTable(ExtractProfiler.CONTENTS_TABLE);
+        debugTable(ExtractProfiler.EXCEPTION_TABLE);
+        debugTable(ExtractProfiler.ERROR_TABLE);
+        assertEquals(10, fNameList.size());
+        assertTrue("file1.pdf", fNameList.contains("file1.pdf"));
+        assertTrue("file2_attachANotB.doc", 
fNameList.contains("file2_attachANotB.doc"));
+        assertTrue("file3_attachBNotA.doc", 
fNameList.contains("file3_attachBNotA.doc"));
+        assertTrue("file4_emptyB.pdf", fNameList.contains("file4_emptyB.pdf"));
+        assertTrue("file7_badJson.pdf", 
fNameList.contains("file7_badJson.pdf"));
+    }
+
+    @Test
+    public void testExtractErrors() throws Exception {
+        String sql = "select EXTRACT_ERROR_TYPE_ID from errors e" +
+                " join containers c on c.container_id = e.container_id "+
+                " where c.file_path='file9_noextract.txt'";
+
+        assertEquals("missing extract: file9_noextract.txt", "0",
+                getSingleResult(sql));
+        debugTable(ExtractProfiler.CONTAINER_TABLE);
+        debugTable(ExtractProfiler.PROFILE_TABLE);
+        debugTable(ExtractProfiler.CONTENTS_TABLE);
+        debugTable(ExtractProfiler.EXCEPTION_TABLE);
+        debugTable(ExtractProfiler.ERROR_TABLE);
+
+        sql = "select EXTRACT_ERROR_TYPE_ID from errors e" +
+                " join containers c on c.container_id = e.container_id "+
+                " where c.file_path='file5_emptyA.pdf'";
+        assertEquals("empty extract: file5_emptyA.pdf", "1",
+                getSingleResult(sql));
+
+        sql = "select EXTRACT_ERROR_TYPE_ID from errors e" +
+                " join containers c on c.container_id = e.container_id "+
+                " where c.file_path='file7_badJson.pdf'";
+        assertEquals("extract error:file7_badJson.pdf", "2",
+                getSingleResult(sql));
+
+    }
+
+    @Test
+    public void testParseErrors() throws Exception {
+        debugTable(ExtractProfiler.ERROR_TABLE);
+        String sql = "select file_path from errors where container_id is null";
+        assertEquals("file10_permahang.txt",
+                getSingleResult(sql));
+
+        sql = "select extract_error_type_id from errors where 
file_path='file11_oom.txt'";
+        assertEquals(Integer.toString(AbstractProfiler.
+                        EXTRACT_ERROR_TYPE.
+                        ZERO_BYTE_EXTRACT_FILE.ordinal()),
+                getSingleResult(sql));
+
+        sql = "select parse_error_type_id from errors where 
file_path='file11_oom.txt'";
+        assertEquals(Integer.toString(AbstractProfiler.
+                        PARSE_ERROR_TYPE.
+                        OOM.ordinal()),
+                getSingleResult(sql));
+
+    }
+
+    @Test
+    public void testParseExceptions() throws Exception {
+        debugTable(ExtractProfiler.EXCEPTION_TABLE);
+    }
+
+    private String getSingleResult(String sql) throws Exception {
+        Statement st = null;
+        st = conn.createStatement();
+        ResultSet rs = st.executeQuery(sql);
+        int hits = 0;
+        String val = "";
+        while (rs.next()) {
+            assertEquals("must have only one column in result",
+                    1, rs.getMetaData().getColumnCount());
+            val = rs.getString(1);
+            hits++;
+        }
+        assertEquals("must have only one hit", 1, hits);
+        return val;
+    }
+
+    //TODO: lots more testing!
+
+    public void debugTable(TableInfo table) throws Exception {
+        Statement st = null;
+        try {
+            String sql = "select * from "+table.getName();
+            st = conn.createStatement();
+            ResultSet rs = st.executeQuery(sql);
+            int colCount = rs.getMetaData().getColumnCount();
+            System.out.println("TABLE: "+table.getName());
+            for (int i = 1; i <= colCount; i++) {
+                if (i > 1) {
+                    System.out.print(" | ");
+                }
+                System.out.print(rs.getMetaData().getColumnName(i));
+            }
+            System.out.println("");
+            int rowCount = 0;
+            while (rs.next()) {
+                for (int i = 1; i <= colCount; i++) {
+                    if (i > 1) {
+                        System.out.print(" | ");
+                    }
+                    System.out.print(rs.getString(i));
+                    rowCount++;
+                }
+                System.out.println("");
+            }
+            if (rowCount == 0) {
+                System.out.println(table.getName() + " was empty");
+            }
+        } finally {
+            if (st != null) {
+                st.close();
+            }
+        }
+
+    }
+}

http://git-wip-us.apache.org/repos/asf/tika/blob/aa7a0c35/tika-eval/src/test/java/org/apache/tika/eval/SimpleComparerTest.java
----------------------------------------------------------------------
diff --git 
a/tika-eval/src/test/java/org/apache/tika/eval/SimpleComparerTest.java 
b/tika-eval/src/test/java/org/apache/tika/eval/SimpleComparerTest.java
new file mode 100644
index 0000000..72e8008
--- /dev/null
+++ b/tika-eval/src/test/java/org/apache/tika/eval/SimpleComparerTest.java
@@ -0,0 +1,289 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.tika.eval;
+
+import static org.apache.tika.eval.AbstractProfiler.EXCEPTION_TYPE;
+import static org.apache.tika.eval.AbstractProfiler.EXTRACT_ERROR_TYPE;
+import static org.apache.tika.eval.AbstractProfiler.getContent;
+import static org.junit.Assert.assertEquals;
+import static org.junit.Assert.assertNull;
+import static org.junit.Assert.assertTrue;
+
+import java.nio.file.Path;
+import java.nio.file.Paths;
+import java.util.ArrayList;
+import java.util.List;
+import java.util.Map;
+import java.util.SortedSet;
+import java.util.TreeSet;
+
+import org.apache.tika.MockDBWriter;
+import org.apache.tika.TikaTest;
+import org.apache.tika.eval.db.Cols;
+import org.apache.tika.eval.db.TableInfo;
+import org.apache.tika.eval.io.ExtractReader;
+import org.apache.tika.eval.util.LanguageIDWrapper;
+import org.apache.tika.metadata.Metadata;
+import org.apache.tika.parser.RecursiveParserWrapper;
+import org.junit.Before;
+import org.junit.Ignore;
+import org.junit.Test;
+
+//These tests ensure that the comparer is extracting the right information
+//into a Map<String,String>.  A full integration test
+//should also ensure that the elements are properly being written to the db
+
public class SimpleComparerTest extends TikaTest {

    //comparer under test; rebuilt before each test in setUp()
    private ExtractComparer comparer = null;
    //in-memory sink that records the rows the comparer emits, keyed by table
    private MockDBWriter writer = null;

    /**
     * Builds a fresh ExtractComparer backed by a MockDBWriter and loads the
     * common-token list and built-in language-id models it depends on.
     */
    @Before
    public void setUp() throws Exception {
        writer = new MockDBWriter();
        comparer = new ExtractComparer(null, null,
                Paths.get("extractsA"), Paths.get("extractsB"),
                writer, -1, -1,
                ExtractReader.ALTER_METADATA_LIST.AS_IS);
        AbstractProfiler.loadCommonTokens(this.getResourceAsFile("/commontokens").toPath());
        LanguageIDWrapper.loadBuiltInModels();
    }

    /**
     * Happy-path comparison of the A and B extracts of file1.pdf: spot-checks
     * the content-comparison row, the per-extract token/length statistics,
     * and the page count recorded in the A profile.
     */
    @Test
    public void testBasic() throws Exception {
        EvalFilePaths fpsA = new EvalFilePaths(
                Paths.get("file1.pdf.json"),
                getResourceAsFile("/test-dirs/extractsA/file1.pdf.json").toPath()
        );
        EvalFilePaths fpsB = new EvalFilePaths(
                Paths.get("file1.pdf.json"),
                getResourceAsFile("/test-dirs/extractsB/file1.pdf.json").toPath());

        comparer.compareFiles(fpsA, fpsB);

        //each list entry is one captured row; only the first is checked here
        List<Map<Cols, String>> tableInfos = writer.getTable(ExtractComparer.CONTENT_COMPARISONS);
        Map<Cols, String> row = tableInfos.get(0);
        assertEquals("0", row.get(Cols.ID));
        assertTrue(
                row.get(Cols.TOP_10_UNIQUE_TOKEN_DIFFS_A)
                        .startsWith("1,200: 1 | 120000: 1 | over: 1"));

        //token/length statistics computed for extract A
        tableInfos = writer.getTable(ExtractComparer.CONTENTS_TABLE_A);
        row = tableInfos.get(0);
        assertEquals("0", row.get(Cols.ID));
        assertEquals("70", row.get(Cols.CONTENT_LENGTH));
        assertEquals("10", row.get(Cols.NUM_UNIQUE_TOKENS));
        assertEquals("14", row.get(Cols.NUM_TOKENS));
        assertEquals("12", row.get(Cols.NUM_ALPHABETIC_TOKENS));
        assertEquals("6", row.get(Cols.NUM_COMMON_TOKENS));
        assertEquals("57", row.get(Cols.TOKEN_LENGTH_SUM));
        assertEquals("en", row.get(Cols.COMMON_TOKENS_LANG));

        //token/length statistics computed for extract B
        tableInfos = writer.getTable(ExtractComparer.CONTENTS_TABLE_B);
        row = tableInfos.get(0);
        assertEquals("0", row.get(Cols.ID));
        assertEquals("76", row.get(Cols.CONTENT_LENGTH));
        assertEquals("9", row.get(Cols.NUM_UNIQUE_TOKENS));
        assertEquals("13", row.get(Cols.NUM_TOKENS));
        assertEquals("4", row.get(Cols.NUM_COMMON_TOKENS));
        assertEquals("64", row.get(Cols.TOKEN_LENGTH_SUM));
        assertEquals("en", row.get(Cols.COMMON_TOKENS_LANG));

        tableInfos = writer.getTable(ExtractComparer.PROFILES_A);
        row = tableInfos.get(0);
        assertEquals("2", row.get(Cols.NUM_PAGES));

    }

    /**
     * Same flow as testBasic but with Spanish text; verifies the statistics
     * and that the common-tokens language is detected as "es".
     */
    @Test
    public void testBasicSpanish() throws Exception {
        EvalFilePaths fpsA = new EvalFilePaths(
                Paths.get("file1.pdf.json"),
                getResourceAsFile("/test-dirs/extractsA/file12_es.txt.json").toPath()
        );
        EvalFilePaths fpsB = new EvalFilePaths(
                Paths.get("file1.pdf.json"),
                getResourceAsFile("/test-dirs/extractsB/file12_es.txt.json").toPath());

        comparer.compareFiles(fpsA, fpsB);

        List<Map<Cols, String>> tableInfos = writer.getTable(ExtractComparer.CONTENTS_TABLE_A);

        Map<Cols, String> row = tableInfos.get(0);
        assertEquals("133", row.get(Cols.CONTENT_LENGTH));
        assertEquals("7", row.get(Cols.NUM_UNIQUE_TOKENS));
        assertEquals("24", row.get(Cols.NUM_TOKENS));
        assertEquals("3", row.get(Cols.NUM_COMMON_TOKENS));
        assertEquals("108", row.get(Cols.TOKEN_LENGTH_SUM));
        assertEquals("es", row.get(Cols.COMMON_TOKENS_LANG));
        assertEquals("24", row.get(Cols.NUM_ALPHABETIC_TOKENS));

    }


    /**
     * A zero-byte B extract must be recorded in the B error table as
     * ZERO_BYTE_EXTRACT_FILE.
     */
    @Test
    public void testEmpty() throws Exception {
        EvalFilePaths fpsA = new EvalFilePaths(
                Paths.get("file1.pdf"),
                getResourceAsFile("/test-dirs/extractsA/file1.pdf.json").toPath()
        );
        EvalFilePaths fpsB = new EvalFilePaths(
                Paths.get("file1.pdf"),
                getResourceAsFile("/test-dirs/extractsB/file4_emptyB.pdf.json").toPath()
        );
        comparer.compareFiles(fpsA, fpsB);
        List<Map<Cols, String>> table = writer.getTable(ExtractComparer.ERROR_TABLE_B);
        Map<Cols, String> row = table.get(0);
        //debugPrintRow(row);
        assertEquals(Integer.toString(EXTRACT_ERROR_TYPE.ZERO_BYTE_EXTRACT_FILE.ordinal()),
                row.get(Cols.EXTRACT_ERROR_TYPE_ID));
    }


    /**
     * getContent should truncate content to the requested maximum length and
     * return empty (never null) for empty or null Metadata.
     */
    @Test
    public void testGetContent() throws Exception {
        Metadata m = new Metadata();
        m.add(RecursiveParserWrapper.TIKA_CONTENT, "0123456789");

        String content = getContent(m, 10);
        assertEquals(10, content.length());

        content = getContent(m, 4);
        assertEquals(4, content.length());

        //test Metadata with no content
        content = getContent(new Metadata(), 10);
        assertEquals(0, content.length());

        //test null Metadata
        content = getContent(null, 10);
        assertEquals(0, content.length());
    }

    /**
     * An access-permission exception present in both extracts must appear in
     * both exception tables, with no stack traces recorded for this
     * exception type.
     */
    @Test
    public void testAccessException() throws Exception {
        EvalFilePaths fpsA = new EvalFilePaths(
                Paths.get("file6_accessEx.pdf.json"),
                getResourceAsFile("/test-dirs/extractsA/file6_accessEx.pdf.json").toPath()
        );
        EvalFilePaths fpsB = new EvalFilePaths(
                Paths.get("file6_accessEx.pdf.json"),
                getResourceAsFile("/test-dirs/extractsB/file6_accessEx.pdf.json").toPath()
        );
        comparer.compareFiles(fpsA, fpsB);
        for (TableInfo t : new TableInfo[]{ExtractComparer.EXCEPTION_TABLE_A, ExtractComparer.EXCEPTION_TABLE_B}) {
            List<Map<Cols, String>> table = writer.getTable(t);

            Map<Cols, String> rowA = table.get(0);
            //debugPrintRow(rowA);
            assertEquals(Integer.toString(EXCEPTION_TYPE.ACCESS_PERMISSION.ordinal()),
                    rowA.get(Cols.PARSE_EXCEPTION_TYPE_ID));
            assertNull(rowA.get(Cols.ORIG_STACK_TRACE));
            assertNull(rowA.get(Cols.SORT_STACK_TRACE));
        }
    }


    /**
     * countAttachments appears to return, per metadata entry, the number of
     * embedded descendants under that entry's resource path (the expected
     * values below are consistent with that reading) -- TODO confirm against
     * AbstractProfiler.countAttachments.
     */
    @Test
    public void testAttachmentCounts() {
        List<Metadata> list = new ArrayList<>();
        Metadata m0 = new Metadata();
        m0.set(RecursiveParserWrapper.EMBEDDED_RESOURCE_PATH, "dir1/dir2/file.zip");//bad data should be ignored
                                                                                   //in the first metadata object
        list.add(m0);
        Metadata m1 = new Metadata();
        m1.set(RecursiveParserWrapper.EMBEDDED_RESOURCE_PATH, "/f1.docx/f2.zip/text1.txt");
        list.add(m1);
        Metadata m2 = new Metadata();
        m2.set(RecursiveParserWrapper.EMBEDDED_RESOURCE_PATH, "/f1.docx/f2.zip/text2.txt");
        list.add(m2);
        Metadata m3 = new Metadata();
        m3.set(RecursiveParserWrapper.EMBEDDED_RESOURCE_PATH, "/f1.docx/f2.zip");
        list.add(m3);
        Metadata m4 = new Metadata();
        m4.set(RecursiveParserWrapper.EMBEDDED_RESOURCE_PATH, "/f1.docx");
        list.add(m4);
        Metadata m5 = new Metadata();
        m5.set(RecursiveParserWrapper.EMBEDDED_RESOURCE_PATH, "/f1.docx/text3.txt");
        list.add(m5);

        List<Integer> counts = AbstractProfiler.countAttachments(list);

        List<Integer> expected = new ArrayList<>();
        expected.add(5);
        expected.add(0);
        expected.add(0);
        expected.add(2);
        expected.add(4);
        expected.add(0);
        assertEquals(expected, counts);
    }


    /**
     * Manual-debugging scaffolding against a short common-token list; the
     * loop body is a no-op unless debugPrintTable is uncommented.
     */
    @Test
    @Ignore
    public void testDebug() throws Exception {
        Path commonTokens = Paths.get(getResourceAsFile("/common_tokens_short.txt").toURI());
        AbstractProfiler.loadCommonTokens(commonTokens);
        EvalFilePaths fpsA = new EvalFilePaths(
                Paths.get("file1.pdf.json"),
                getResourceAsFile("/test-dirs/extractsA/file1.pdf.json").toPath()
        );
        EvalFilePaths fpsB = new EvalFilePaths(
                Paths.get("file1.pdf.json"),
                getResourceAsFile("/test-dirs/extractsB/file1.pdf.json").toPath()
        );
        comparer.compareFiles(fpsA, fpsB);
        for (TableInfo t : new TableInfo[]{
                ExtractComparer.COMPARISON_CONTAINERS,
                ExtractComparer.ERROR_TABLE_A,
                ExtractComparer.ERROR_TABLE_B,
                ExtractComparer.EXCEPTION_TABLE_A,
                ExtractComparer.EXCEPTION_TABLE_B,
                ExtractComparer.PROFILES_A,
                ExtractComparer.PROFILES_B,
                ExtractComparer.CONTENTS_TABLE_A,
                ExtractComparer.CONTENTS_TABLE_B,
                ExtractComparer.CONTENT_COMPARISONS}) {
            //debugPrintTable(t);
        }
    }

    //dumps every row of a captured table to stdout (manual debugging only)
    private void debugPrintTable(TableInfo tableInfo) {
        List<Map<Cols, String>> table = writer.getTable(tableInfo);
        if (table == null) {
            return;
        }
        int i = 0;
        System.out.println("TABLE: "+tableInfo.getName());
        for (Map<Cols, String> row : table) {
            SortedSet<Cols> keys = new TreeSet<Cols>(row.keySet());
            for (Cols key : keys) {
                System.out.println( i + " :: " + key + " : " + row.get(key));
            }
            i++;
        }
        System.out.println("");
    }

    //dumps a single row with keys in sorted order (manual debugging only)
    private void debugPrintRow(Map<Cols, String> row) {
        SortedSet<Cols> keys = new TreeSet<Cols>(row.keySet());
        for (Cols key : keys) {
            System.out.println(key + " : " + row.get(key));
        }
    }
}

http://git-wip-us.apache.org/repos/asf/tika/blob/aa7a0c35/tika-eval/src/test/java/org/apache/tika/eval/TikaEvalCLITest.java
----------------------------------------------------------------------
diff --git a/tika-eval/src/test/java/org/apache/tika/eval/TikaEvalCLITest.java 
b/tika-eval/src/test/java/org/apache/tika/eval/TikaEvalCLITest.java
new file mode 100644
index 0000000..c358149
--- /dev/null
+++ b/tika-eval/src/test/java/org/apache/tika/eval/TikaEvalCLITest.java
@@ -0,0 +1,42 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.tika.eval;
+
+import java.util.ArrayList;
+import java.util.List;
+
+import org.junit.Ignore;
+import org.junit.Test;
+
+public class TikaEvalCLITest {
+
+    @Test
+    @Ignore("TODO: add real tests")
+    public void testBasic() throws Exception {
+        List<String> args = new ArrayList<>();
+        args.add("Profile");
+        args.add("-extractDir");
+        args.add("tika");
+        args.add("-db");
+        args.add("mydb");
+        args.add("-alterExtract");
+        args.add("first_only");
+        TikaEvalCLI.main(args.toArray(new String[args.size()]));
+    }
+
+}

http://git-wip-us.apache.org/repos/asf/tika/blob/aa7a0c35/tika-eval/src/test/java/org/apache/tika/eval/db/AbstractBufferTest.java
----------------------------------------------------------------------
diff --git 
a/tika-eval/src/test/java/org/apache/tika/eval/db/AbstractBufferTest.java 
b/tika-eval/src/test/java/org/apache/tika/eval/db/AbstractBufferTest.java
new file mode 100644
index 0000000..810425b
--- /dev/null
+++ b/tika-eval/src/test/java/org/apache/tika/eval/db/AbstractBufferTest.java
@@ -0,0 +1,160 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.tika.eval.db;
+
+
+import static org.junit.Assert.assertEquals;
+
+import java.sql.SQLException;
+import java.util.ArrayList;
+import java.util.Collections;
+import java.util.HashMap;
+import java.util.List;
+import java.util.Map;
+import java.util.Random;
+import java.util.concurrent.Callable;
+import java.util.concurrent.CompletionService;
+import java.util.concurrent.ExecutionException;
+import java.util.concurrent.ExecutorCompletionService;
+import java.util.concurrent.ExecutorService;
+import java.util.concurrent.Executors;
+import java.util.concurrent.Future;
+import java.util.concurrent.TimeUnit;
+
+import org.junit.Test;
+
+public class AbstractBufferTest {
+
+
+    @Test(timeout = 30000)
+    public void runTest() throws InterruptedException, ExecutionException {
+        List<String> keys = new ArrayList<>();
+        Collections.addAll(keys, new String[]{
+                "a", "b", "c", "d", "e", "f", "g", "h", "i", "j", "k"});
+
+        int numGets = 100;
+        int numTesters = 20;
+        AbstractDBBuffer b = new TestBuffer();
+
+
+        ExecutorService ex = Executors.newFixedThreadPool(numTesters);
+        CompletionService<MyTestResult> completionService =
+                new ExecutorCompletionService<>(
+                        ex);
+        for (int i = 0; i < numTesters; i++) {
+            completionService.submit(new Tester(keys, b, numGets));
+        }
+
+        int results = 0;
+        Map<String, Integer> combined = new HashMap<>();
+        while (results < numTesters) {
+            Future<MyTestResult> futureResult =
+                    completionService.poll(1, TimeUnit.SECONDS);
+            if (futureResult != null) {
+                results++;
+                assertEquals(keys.size(), 
futureResult.get().getMap().keySet().size());
+                for (Map.Entry<String, Integer> e : 
futureResult.get().getMap().entrySet()) {
+                    if (!combined.containsKey(e.getKey())) {
+                        combined.put(e.getKey(), e.getValue());
+                    } else {
+                        assertEquals(combined.get(e.getKey()), e.getValue());
+                    }
+                }
+            }
+        }
+        assertEquals(keys.size(), b.getNumWrites());
+    }
+
+    private class Tester implements Callable<MyTestResult> {
+
+        private Random r = new Random();
+        private Map<String, Integer> m = new HashMap<>();
+        List<String> keys = new ArrayList<>();
+        private final AbstractDBBuffer dbBuffer;
+        private final int numGets;
+
+        private Tester(List<String> inputKeys, AbstractDBBuffer buffer, int 
numGets) {
+            keys.addAll(inputKeys);
+            dbBuffer = buffer;
+            this.numGets = numGets;
+        }
+
+        @Override
+        public MyTestResult call() throws Exception {
+
+
+            for (int i = 0; i < numGets; i++) {
+                int index = r.nextInt(keys.size());
+                String k = keys.get(index);
+                if (k == null) {
+                    throw new RuntimeException("keys can't be null");
+                }
+                Integer expected = m.get(k);
+                Integer val = dbBuffer.getId(k);
+                if (val == null) {
+                    throw new RuntimeException("Val can't be null!");
+                }
+                if (expected != null) {
+                    assertEquals(expected, val);
+                }
+                m.put(k, val);
+            }
+
+            //now add the val for every key
+            //just in case the rand() process didn't hit
+            //all indices
+            for (String k : keys) {
+                Integer val = dbBuffer.getId(k);
+                m.put(k, val);
+            }
+            MyTestResult r = new MyTestResult(m);
+            return r;
+        }
+    }
+
+    private class MyTestResult {
+        Map<String, Integer> m;
+        private MyTestResult(Map<String, Integer> m) {
+            this.m = m;
+        }
+        private Map<String, Integer> getMap() {
+            return m;
+        }
+
+        @Override
+        public String toString() {
+            return "MyTester: "+m.size();
+        }
+    }
+
+    private class TestBuffer extends AbstractDBBuffer {
+        @Override
+        public void write(int id, String value) throws RuntimeException {
+            try {
+                Thread.sleep(100);
+            } catch (InterruptedException e) {
+                throw new RuntimeException(e);
+            }
+        }
+
+        @Override
+        public void close() throws SQLException {
+            //no-op
+        }
+    }
+}

http://git-wip-us.apache.org/repos/asf/tika/blob/aa7a0c35/tika-eval/src/test/java/org/apache/tika/eval/io/ExtractReaderTest.java
----------------------------------------------------------------------
diff --git 
a/tika-eval/src/test/java/org/apache/tika/eval/io/ExtractReaderTest.java 
b/tika-eval/src/test/java/org/apache/tika/eval/io/ExtractReaderTest.java
new file mode 100644
index 0000000..14c0013
--- /dev/null
+++ b/tika-eval/src/test/java/org/apache/tika/eval/io/ExtractReaderTest.java
@@ -0,0 +1,85 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.tika.eval.io;
+
+
+import static org.junit.Assert.assertEquals;
+
+import java.io.IOException;
+import java.nio.file.Path;
+import java.util.List;
+
+import org.apache.tika.TikaTest;
+import org.apache.tika.metadata.Metadata;
+import org.apache.tika.parser.RecursiveParserWrapper;
+import org.junit.Before;
+import org.junit.Test;
+
public class ExtractReaderTest extends TikaTest {

    //json extract containing a container document plus one attachment
    private Path testJsonFile;
    //plain-text extract; the original file type is encoded in the file name
    private Path testTxtFile;

    @Before
    public void setUp() throws Exception {
        testJsonFile = getResourceAsFile("/test-dirs/extractsA/file2_attachANotB.doc.json").toPath();
        testTxtFile = getResourceAsFile("/test-dirs/extractsB/file13_attachANotB.doc.txt").toPath();
    }

    /**
     * Exercises the three ALTER_METADATA_LIST modes against a json extract
     * with one attachment: AS_IS keeps both metadata objects, FIRST_ONLY
     * drops the attachment, and CONCATENATE_CONTENT_INTO_FIRST merges the
     * attachment's content into the single remaining metadata object.
     */
    @Test
    public void testBasic() throws Exception {

        ExtractReader extractReader = new ExtractReader();
        List<Metadata> metadataList = extractReader.loadExtract(testJsonFile,
                ExtractReader.ALTER_METADATA_LIST.AS_IS);
        assertEquals(2, metadataList.size());
        assertEquals(1, metadataList.get(0).getValues(RecursiveParserWrapper.TIKA_CONTENT).length);
        assertEquals(1, metadataList.get(1).getValues(RecursiveParserWrapper.TIKA_CONTENT).length);
        assertContains("fox", metadataList.get(0).get(RecursiveParserWrapper.TIKA_CONTENT));
        assertContains("attachment", metadataList.get(1).get(RecursiveParserWrapper.TIKA_CONTENT));

        //FIRST_ONLY: attachment content must be gone entirely
        metadataList = extractReader.loadExtract(testJsonFile, ExtractReader.ALTER_METADATA_LIST.FIRST_ONLY);
        assertEquals(1, metadataList.size());
        assertEquals(1, metadataList.get(0).getValues(RecursiveParserWrapper.TIKA_CONTENT).length);
        assertContains("fox", metadataList.get(0).get(RecursiveParserWrapper.TIKA_CONTENT));
        assertNotContained("attachment", metadataList.get(0).get(RecursiveParserWrapper.TIKA_CONTENT));

        //CONCATENATE: one metadata object carrying both contents
        metadataList = extractReader.loadExtract(testJsonFile, ExtractReader.ALTER_METADATA_LIST.CONCATENATE_CONTENT_INTO_FIRST);
        assertEquals(1, metadataList.size());
        assertEquals(1, metadataList.get(0).getValues(RecursiveParserWrapper.TIKA_CONTENT).length);
        assertContains("fox", metadataList.get(0).get(RecursiveParserWrapper.TIKA_CONTENT));
        assertContains("attachment", metadataList.get(0).get(RecursiveParserWrapper.TIKA_CONTENT));
    }

    /**
     * A .txt extract should load as a single metadata object whose content
     * is the raw text, with the content type taken from the ".doc" embedded
     * in the file name rather than the ".txt" suffix.
     */
    @Test
    public void testTextBasic() throws IOException {
        ExtractReader extractReader = new ExtractReader();
        List<Metadata> metadataList = extractReader.loadExtract(testTxtFile,
                ExtractReader.ALTER_METADATA_LIST.AS_IS);
        assertEquals(1, metadataList.size());
        Metadata m = metadataList.get(0);
        assertEquals(1, m.getValues(RecursiveParserWrapper.TIKA_CONTENT).length);
        assertEquals("the quick brown fox fox fox jumped over the lazy lazy dog\n",
                m.get(RecursiveParserWrapper.TIKA_CONTENT));

        //test that the mime is inferred from the file extension
        assertEquals("application/msword", m.get(Metadata.CONTENT_TYPE));
    }



}

http://git-wip-us.apache.org/repos/asf/tika/blob/aa7a0c35/tika-eval/src/test/java/org/apache/tika/eval/io/FatalExceptionReaderTest.java
----------------------------------------------------------------------
diff --git 
a/tika-eval/src/test/java/org/apache/tika/eval/io/FatalExceptionReaderTest.java 
b/tika-eval/src/test/java/org/apache/tika/eval/io/FatalExceptionReaderTest.java
new file mode 100644
index 0000000..3b99a76
--- /dev/null
+++ 
b/tika-eval/src/test/java/org/apache/tika/eval/io/FatalExceptionReaderTest.java
@@ -0,0 +1,32 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.tika.eval.io;
+
+
+import java.io.InputStream;
+
+import org.junit.Test;
+
+public class FatalExceptionReaderTest {
+    @Test
+    public void testSimpleRead() throws Exception {
+        InputStream is = 
this.getClass().getResourceAsStream("/test-dirs/batch-logs/batch-process-fatal.xml");
+        XMLLogReader reader = new XMLLogReader();
+        //reader.read(is);
+    }
+}

http://git-wip-us.apache.org/repos/asf/tika/blob/aa7a0c35/tika-eval/src/test/java/org/apache/tika/eval/reports/ResultsReporterTest.java
----------------------------------------------------------------------
diff --git 
a/tika-eval/src/test/java/org/apache/tika/eval/reports/ResultsReporterTest.java 
b/tika-eval/src/test/java/org/apache/tika/eval/reports/ResultsReporterTest.java
new file mode 100644
index 0000000..5e43303
--- /dev/null
+++ 
b/tika-eval/src/test/java/org/apache/tika/eval/reports/ResultsReporterTest.java
@@ -0,0 +1,60 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.tika.eval.reports;
+
+import java.nio.file.Files;
+import java.nio.file.Path;
+import java.nio.file.Paths;
+import java.sql.Connection;
+import java.sql.Statement;
+
+import org.apache.tika.eval.db.H2Util;
+import org.junit.Before;
+import org.junit.Ignore;
+import org.junit.Test;
+
+public class ResultsReporterTest {
+    private Path configFile;
+    private Path tmpDir;
+    private String dbName = "test-db";
+    private Connection connection;
+
+    @Before
+    public void setUp() throws Exception {
+        configFile = 
Paths.get(this.getClass().getResource("/reports.xml").toURI());
+        tmpDir = Files.createTempDirectory("tika-eval-report-test-");
+
+        connection = new H2Util(tmpDir.resolve(dbName)).getConnection(true);
+        String sql = "CREATE TABLE test_table (ID LONG PRIMARY KEY, STRING 
VARCHAR(32))";
+        Statement st = connection.createStatement();
+        st.execute(sql);
+        sql = "INSERT into test_table values ( 100000, 'the quick brown')";
+        st.execute(sql);
+        sql = "INSERT into test_table values (123456789, 'fox jumped over')";
+        st.execute(sql);
+        connection.commit();
+    }
+
+    @Test
+    @Ignore("add a real test here")
+    public void testBuilder() throws Exception {
+        ResultsReporter r = ResultsReporter.build(configFile);
+        r.execute(connection, Paths.get("reports"));
+        System.out.println("finished: "+ tmpDir.toString());
+    }
+}

http://git-wip-us.apache.org/repos/asf/tika/blob/aa7a0c35/tika-eval/src/test/java/org/apache/tika/eval/tokens/LuceneTokenCounter.java
----------------------------------------------------------------------
diff --git 
a/tika-eval/src/test/java/org/apache/tika/eval/tokens/LuceneTokenCounter.java 
b/tika-eval/src/test/java/org/apache/tika/eval/tokens/LuceneTokenCounter.java
new file mode 100644
index 0000000..486cac7
--- /dev/null
+++ 
b/tika-eval/src/test/java/org/apache/tika/eval/tokens/LuceneTokenCounter.java
@@ -0,0 +1,191 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.tika.eval.tokens;
+
+import java.io.IOException;
+import java.util.HashMap;
+import java.util.Map;
+
+import org.apache.commons.math3.stat.descriptive.SummaryStatistics;
+import org.apache.commons.math3.util.FastMath;
+import org.apache.lucene.analysis.Analyzer;
+import org.apache.lucene.index.LeafReader;
+import org.apache.lucene.index.Terms;
+import org.apache.lucene.index.TermsEnum;
+import org.apache.lucene.index.memory.MemoryIndex;
+import org.apache.lucene.search.IndexSearcher;
+import org.apache.lucene.util.BytesRef;
+
/**
 * Experimental class uses Lucene's MemoryIndex to effectively build the
 * token info.
 * <p>
 * For each added field this computes and caches a {@code TokenStatistics}:
 * total and unique token counts, the top-N tokens by frequency, an
 * entropy value, and summary statistics of token lengths in codepoints.
 */
public class LuceneTokenCounter {
    // Suffix for the parallel "alpha/ideograph" field (indexing of it is
    // currently disabled -- see the commented-out lines in add()).
    private static final String ALPHA_IDEOGRAPH_SUFFIX = "_a";

    private final LeafReader leafReader;
    private final MemoryIndex memoryIndex;
    private final Analyzer generalAnalyzer;
    private final Analyzer alphaIdeographAnalyzer;
    // How many of the most frequent tokens to retain per field.
    private int topN = 10;

    // Cached per-field statistics, refreshed by count(String).
    Map<String, TokenStatistics> fieldStats = new HashMap<>();

    /**
     * @param generalAnalyzer analyzer used when indexing field content
     * @param alphaIdeographAnalyzer analyzer intended for the "_a" companion
     *                               field; stored but currently unused
     * @throws IOException if the in-memory searcher cannot be created
     */
    public LuceneTokenCounter(Analyzer generalAnalyzer, Analyzer alphaIdeographAnalyzer) throws IOException {
        memoryIndex = new MemoryIndex();
        IndexSearcher searcher = memoryIndex.createSearcher();
        // NOTE(review): assumes MemoryIndex's reader is a single-segment
        // LeafReader -- confirm this holds across Lucene versions.
        leafReader = (LeafReader)searcher.getIndexReader();
        this.generalAnalyzer = generalAnalyzer;
        this.alphaIdeographAnalyzer = alphaIdeographAnalyzer;
    }

    /**
     * Indexes {@code content} under {@code field} and recomputes that
     * field's statistics.
     */
    public void add(String field, String content) throws IOException {
        memoryIndex.addField(field, content, generalAnalyzer);
        //memoryIndex.addField(field+ALPHA_IDEOGRAPH_SUFFIX,
        //        content, alphaIdeographAnalyzer);
        count(field);
        //count(field+ALPHA_IDEOGRAPH_SUFFIX);

    }


    /**
     * Recomputes token statistics for {@code field} and stores the result in
     * {@link #fieldStats}.
     *
     * @throws IllegalArgumentException if any count exceeds Integer.MAX_VALUE
     */
    void count(String field) throws IOException {
        long tokenCount = leafReader.getSumTotalTermFreq(field);
        if (tokenCount > Integer.MAX_VALUE) {
            throw new IllegalArgumentException("can't handle longs");
        }
        int tokenCountInt = (int)tokenCount;
        int uniqueTokenCount = 0;
        SummaryStatistics summStats = new SummaryStatistics();
        double ent = 0.0d;
        double p = 0.0d;
        double base = 2.0;

        Terms terms = leafReader.terms(field);
        if (terms == null) {
            //if there were no terms
            fieldStats.put(field, new TokenStatistics(uniqueTokenCount, tokenCountInt,
                    new TokenIntPair[0], ent, summStats));
            return;

        }
        TermsEnum termsEnum = terms.iterator();
        BytesRef bytesRef = termsEnum.next();
        TokenCountPriorityQueue queue= new TokenCountPriorityQueue(topN);

        while (bytesRef != null) {

            long termFreq = termsEnum.totalTermFreq();
            if (termFreq > Integer.MAX_VALUE) {
                throw new IllegalArgumentException("Sorry can't handle longs yet");
            }
            int tf = (int)termFreq;
            //TODO: figure out how to avoid Stringifying this
            //to get codepoint count
            String t = bytesRef.utf8ToString();
            int len = t.codePointCount(0, t.length());
            // token length contributes once per occurrence (frequency-weighted)
            for (int i = 0; i < tf; i++) {
                summStats.addValue(len);
            }
            p = (double) tf / (double) tokenCount;
            // accumulate sum(p * log2(p)); the sign is flipped after the loop
            ent += p * FastMath.log(base, p);

            // only try to insert when the candidate could enter the top-N
            if (queue.top() == null || queue.size() < topN ||
                    tf >= queue.top().getValue()) {
                queue.insertWithOverflow(new TokenIntPair(t, tf));
            }

            uniqueTokenCount++;
            bytesRef = termsEnum.next();
        }
        if (tokenCountInt > 0) {
            // NOTE(review): this scales the Shannon entropy by 1/tokenCount
            // rather than just negating it -- presumably intentional so the
            // value matches TokenCounter's calculation (the unit tests assert
            // the two counters produce identical statistics); confirm before
            // changing.
            ent = (-1.0d / (double)tokenCountInt) * ent;
        }

        fieldStats.put(field, new TokenStatistics(uniqueTokenCount, tokenCountInt,
                queue.getArray(), ent, summStats));
    }

    /** Sets how many top tokens are retained for fields counted after this call. */
    public void setTopN(int topN) {
        this.topN = topN;
    }

    /** @return cached statistics for the field, or null if never counted */
    public TokenStatistics getTokenStatistics(String field) {
        return fieldStats.get(field);
    }
    /** @return terms of the field's "_a" companion field (may be null) */
    public Terms getAlphaTerms(String field) throws IOException {
        return leafReader.terms(field+ALPHA_IDEOGRAPH_SUFFIX);
    }
    /** @return terms of the field, or null if the field has no terms */
    public Terms getTerms(String field) throws IOException {
        return leafReader.terms(field);
    }


    /** Resets the index and drops all cached statistics. */
    public void clear() {
        memoryIndex.reset();
        fieldStats.clear();
    }
// NOTE(review): unfinished draft of a field-contrast (dice/overlap) metric;
// as written the inner loop dereferences bytesRefB before its null check and
// references undeclared variables (tokens, theseTokens, thoseTokens) --
// it would need rework before being revived.
/*
    public ContrastStatistics contrast(String fieldA, String fieldB) throws IOException {
        long diceDenom = getUniqueTokenCount(fieldA) +
                getUniqueTokenCount(fieldB);

        long diceNum = 0;
        long overlapNum = 0;

        Terms termsA = getTerms(fieldA);
        Terms termsB = getTerms(fieldB);

        TermsEnum termsEnumA = termsA.iterator();
        TermsEnum termsEnumB = termsB.iterator();

        BytesRef bytesRefA = termsEnumA.next();
        BytesRef bytesRefB = termsEnumB.next();

        while (bytesRefA != null) {
            int compare = bytesRefA.compareTo(bytesRefB);
            while (compare > 0) {
                if (bytesRefB == null) {
                    break;
                }
                //handle term in B, but not A

                compare = bytesRefA.compareTo(bytesRefB);
                bytesRefB = termsEnumB.next();
            }
            if (compare == 0) {
                diceNum += 2;
                overlapNum += 2 * Math.min(termsEnumA.totalTermFreq(), termsEnumB.totalTermFreq());
            }

            bytesRefA = termsEnumA.next();
        }


        for (PairCount p : tokens.values()) {
            if (p.a > 0 && p.b > 0) {
                diceNum += 2;
                overlapNum += 2 * Math.min(p.a, p.b);
            }
        }

        float dice = (float) diceNum / (float) diceDenom;
        float overlap = (float) overlapNum / (float) (theseTokens.getTokenCount() + thoseTokens.getTokenCount());
    }
*/
}

http://git-wip-us.apache.org/repos/asf/tika/blob/aa7a0c35/tika-eval/src/test/java/org/apache/tika/eval/tokens/TokenCounterTest.java
----------------------------------------------------------------------
diff --git 
a/tika-eval/src/test/java/org/apache/tika/eval/tokens/TokenCounterTest.java 
b/tika-eval/src/test/java/org/apache/tika/eval/tokens/TokenCounterTest.java
new file mode 100644
index 0000000..719b56c
--- /dev/null
+++ b/tika-eval/src/test/java/org/apache/tika/eval/tokens/TokenCounterTest.java
@@ -0,0 +1,131 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.tika.eval.tokens;
+
+
+import static org.junit.Assert.assertEquals;
+
+import java.io.IOException;
+import java.util.Date;
+import java.util.HashMap;
+import java.util.Map;
+import java.util.Random;
+
+import org.apache.commons.lang3.mutable.MutableInt;
+import org.apache.lucene.analysis.Analyzer;
+import org.apache.lucene.analysis.TokenStream;
+import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
+import org.junit.BeforeClass;
+import org.junit.Test;
+
+public class TokenCounterTest {
+    private final static String FIELD = "f";
+    private static AnalyzerManager analyzerManager;
+
+    private final int topN = 10;
+
+    @BeforeClass
+    public static void setUp() throws IOException {
+        analyzerManager = AnalyzerManager.newInstance();
+
+    }
+
+    @Test
+    public void testBasic() throws Exception {
+        String s = " bde cde def abc efg f f f f ghijklmnop a a a a a a a a a 
a a a a a a a a b b b b b b b b b b b b b";
+        TokenCounter counter = new 
TokenCounter(analyzerManager.getGeneralAnalyzer(),
+                analyzerManager.getAlphaIdeoAnalyzer());
+        counter.add(FIELD, s);
+        TokenStatistics simpleTokenStatistics = 
counter.getTokenStatistics(FIELD);
+        LuceneTokenCounter tokenCounter = new 
LuceneTokenCounter(analyzerManager.getGeneralAnalyzer(),
+                analyzerManager.getAlphaIdeoAnalyzer());
+        tokenCounter.add(FIELD, s);
+        assertEquals(simpleTokenStatistics, 
tokenCounter.getTokenStatistics(FIELD));
+    }
+
+    @Test
+    public void testRandom() throws Exception {
+
+        long simple = 0;
+        long lucene = 0;
+        int numberOfTests = 100;
+        for (int i = 0; i < numberOfTests; i++) {
+            String s = generateString();
+            long start = new Date().getTime();
+            TokenCounter counter = new 
TokenCounter(analyzerManager.getGeneralAnalyzer(),
+                    analyzerManager.getAlphaIdeoAnalyzer());
+            counter.add(FIELD, s);
+            simple += new Date().getTime()-start;
+            TokenStatistics simpleTokenStatistics = 
counter.getTokenStatistics(FIELD);
+
+            start = new Date().getTime();
+            LuceneTokenCounter tokenCounter = new 
LuceneTokenCounter(analyzerManager.getGeneralAnalyzer(),
+                    analyzerManager.getAlphaIdeoAnalyzer());
+            tokenCounter.add(FIELD, s);
+            lucene += new Date().getTime()-start;
+            assertEquals(s, simpleTokenStatistics, 
tokenCounter.getTokenStatistics(FIELD));
+        }
+
+        //System.out.println("SIMPLE: " + simple + " lucene: "+lucene);
+    }
+
+    @Test
+    public void testCommonTokens() throws Exception {
+        TokenCounter tokenCounter = new 
TokenCounter(analyzerManager.getGeneralAnalyzer(),
+                analyzerManager.getAlphaIdeoAnalyzer());
+        String s = "the http://www.cnn.com and blahdeb...@apache.org are in 
valuable www.sites.org 普林斯顿大学";
+        tokenCounter.add(FIELD, s);
+        Map<String, MutableInt> tokens = tokenCounter.getAlphaTokens(FIELD);
+        assertEquals(new MutableInt(2), tokens.get("___url___"));
+        assertEquals(new MutableInt(1), tokens.get("___email___"));
+    }
+
+    @Test
+    public void testCJKFilter() throws Exception {
+        String s = "then quickbrownfoxjumpedoverthelazy dogss dog 
普林斯顿大学";
+        Analyzer analyzer = analyzerManager.getCommonTokensAnalyzer();
+        TokenStream ts = analyzer.tokenStream(FIELD, s);
+        CharTermAttribute termAtt = ts.getAttribute(CharTermAttribute.class);
+        ts.reset();
+        Map<String, Integer> tokens = new HashMap<>();
+        while (ts.incrementToken()) {
+            String t = termAtt.toString();
+            Integer count = tokens.get(t);
+            count = (count == null) ? count = 0 : count;
+            count++;
+            tokens.put(t, count);
+        }
+        ts.end();
+        ts.close();
+        assertEquals(7, tokens.size());
+        assertEquals(new Integer(1), tokens.get("林斯"));
+    }
+
+    private String generateString() {
+
+        Random r = new Random();
+        int len = r.nextInt(1000);
+        int uniqueVocabTerms = 10000;
+        StringBuilder sb = new StringBuilder();
+        for (int i = 0; i < len; i++) {
+            sb.append(Integer.toString(r.nextInt(uniqueVocabTerms)+100000));
+            sb.append(" ");
+        }
+        return sb.toString();
+    }
+}

http://git-wip-us.apache.org/repos/asf/tika/blob/aa7a0c35/tika-eval/src/test/java/org/apache/tika/eval/util/MimeUtilTest.java
----------------------------------------------------------------------
diff --git 
a/tika-eval/src/test/java/org/apache/tika/eval/util/MimeUtilTest.java 
b/tika-eval/src/test/java/org/apache/tika/eval/util/MimeUtilTest.java
new file mode 100644
index 0000000..40b7484
--- /dev/null
+++ b/tika-eval/src/test/java/org/apache/tika/eval/util/MimeUtilTest.java
@@ -0,0 +1,65 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.tika.eval.util;
+
+
+import static junit.framework.TestCase.assertTrue;
+import static org.junit.Assert.assertEquals;
+
+import org.apache.tika.config.TikaConfig;
+import org.apache.tika.mime.MimeType;
+import org.apache.tika.mime.MimeTypeException;
+import org.apache.tika.mime.MimeTypes;
+import org.junit.Ignore;
+import org.junit.Test;
+
+@Ignore("Fix mimetype.getExtension to work with these and then we can get rid 
of MimeUtil")
+public class MimeUtilTest {
+
+    private final TikaConfig config = TikaConfig.getDefaultConfig();
+
+    @Test
+    public void testBasic() throws Exception {
+        assertResult("application/pdf", ".pdf");
+        assertResult("APPLICATION/PDF", ".pdf");
+        assertResult("text/plain; charset=ISO-8859-1", ".txt");
+        assertResult("application/xhtml+xml; charset=UTF-8\n", ".html");
+        assertResult("application/xml; charset=UTF-8\n", ".xml");
+
+        assertException("bogosity", "xml");
+    }
+
+    private void assertException(String contentType, String expected) {
+        boolean ex = false;
+        try {
+            assertResult(contentType, expected);
+        } catch (MimeTypeException e) {
+            ex = true;
+        }
+        assertTrue("Should have had exception for: " + contentType, ex);
+    }
+
+    private void assertResult(String contentType, String expected) throws 
MimeTypeException {
+        TikaConfig tikaConfig = TikaConfig.getDefaultConfig();
+        MimeTypes r = tikaConfig.getMimeRepository();
+        MimeType mt = r.forName(contentType);
+
+//        String ext = MimeUtil.getExtension(contentType, config);
+        assertEquals(expected, mt.getExtension());
+    }
+}

http://git-wip-us.apache.org/repos/asf/tika/blob/aa7a0c35/tika-eval/src/test/resources/commontokens/en
----------------------------------------------------------------------
diff --git a/tika-eval/src/test/resources/commontokens/en 
b/tika-eval/src/test/resources/commontokens/en
new file mode 100644
index 0000000..8d442fe
--- /dev/null
+++ b/tika-eval/src/test/resources/commontokens/en
@@ -0,0 +1,8 @@
+the
+of
+and
+a
+or
+#quick
+brown
+fox
\ No newline at end of file

http://git-wip-us.apache.org/repos/asf/tika/blob/aa7a0c35/tika-eval/src/test/resources/commontokens/es
----------------------------------------------------------------------
diff --git a/tika-eval/src/test/resources/commontokens/es 
b/tika-eval/src/test/resources/commontokens/es
new file mode 100644
index 0000000..b9bfd03
--- /dev/null
+++ b/tika-eval/src/test/resources/commontokens/es
@@ -0,0 +1,10 @@
+la
+de
+y
+una
+
+
+o
+rápido
+marrón
+zorro
\ No newline at end of file

http://git-wip-us.apache.org/repos/asf/tika/blob/aa7a0c35/tika-eval/src/test/resources/commontokens/zh-cn
----------------------------------------------------------------------
diff --git a/tika-eval/src/test/resources/commontokens/zh-cn 
b/tika-eval/src/test/resources/commontokens/zh-cn
new file mode 100644
index 0000000..bec617d
--- /dev/null
+++ b/tika-eval/src/test/resources/commontokens/zh-cn
@@ -0,0 +1,8 @@
+的
+的
+和
+一個
+要么
+快
+棕色
+狐狸
\ No newline at end of file

http://git-wip-us.apache.org/repos/asf/tika/blob/aa7a0c35/tika-eval/src/test/resources/commontokens/zh-tw
----------------------------------------------------------------------
diff --git a/tika-eval/src/test/resources/commontokens/zh-tw 
b/tika-eval/src/test/resources/commontokens/zh-tw
new file mode 100644
index 0000000..bc91291
--- /dev/null
+++ b/tika-eval/src/test/resources/commontokens/zh-tw
@@ -0,0 +1,8 @@
+的
+的
+和
+一个
+要么
+快
+棕色
+狐狸
\ No newline at end of file

http://git-wip-us.apache.org/repos/asf/tika/blob/aa7a0c35/tika-eval/src/test/resources/log4j.properties
----------------------------------------------------------------------
diff --git a/tika-eval/src/test/resources/log4j.properties 
b/tika-eval/src/test/resources/log4j.properties
new file mode 100644
index 0000000..925f9f2
--- /dev/null
+++ b/tika-eval/src/test/resources/log4j.properties
@@ -0,0 +1,11 @@
+
+log4j.rootLogger=WARN,A1
+
+#for debugging
+#log4j.rootLogger=TRACE,A1
+
+log4j.appender.A1=org.apache.log4j.ConsoleAppender
+
+# A1 uses PatternLayout.
+log4j.appender.A1.layout=org.apache.log4j.PatternLayout
+log4j.appender.A1.layout.ConversionPattern=%-4r [%t] %-5p %c %x - %m%n

http://git-wip-us.apache.org/repos/asf/tika/blob/aa7a0c35/tika-eval/src/test/resources/log4j_process.properties
----------------------------------------------------------------------
diff --git a/tika-eval/src/test/resources/log4j_process.properties 
b/tika-eval/src/test/resources/log4j_process.properties
new file mode 100644
index 0000000..cca8871
--- /dev/null
+++ b/tika-eval/src/test/resources/log4j_process.properties
@@ -0,0 +1,11 @@
+
+log4j.rootLogger=TRACE,A1
+
+#for debugging
+#log4j.rootLogger=TRACE,A1
+
+log4j.appender.A1=org.apache.log4j.ConsoleAppender
+
+# A1 uses PatternLayout.
+log4j.appender.A1.layout=org.apache.log4j.PatternLayout
+log4j.appender.A1.layout.ConversionPattern=%-4r [%t] %-5p %c %x - %m%n

http://git-wip-us.apache.org/repos/asf/tika/blob/aa7a0c35/tika-eval/src/test/resources/single-file-profiler-crawl-extract-config.xml
----------------------------------------------------------------------
diff --git 
a/tika-eval/src/test/resources/single-file-profiler-crawl-extract-config.xml 
b/tika-eval/src/test/resources/single-file-profiler-crawl-extract-config.xml
new file mode 100644
index 0000000..e8b9d6c
--- /dev/null
+++ b/tika-eval/src/test/resources/single-file-profiler-crawl-extract-config.xml
@@ -0,0 +1,72 @@
+<?xml version="1.0" encoding="UTF-8" standalone="no" ?>
+<!--
+  Licensed to the Apache Software Foundation (ASF) under one
+  or more contributor license agreements.  See the NOTICE file
+  distributed with this work for additional information
+  regarding copyright ownership.  The ASF licenses this file
+  to you under the Apache License, Version 2.0 (the
+  "License"); you may not use this file except in compliance
+  with the License.  You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+  Unless required by applicable law or agreed to in writing,
+  software distributed under the License is distributed on an
+  "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+  KIND, either express or implied.  See the License for the
+  specific language governing permissions and limitations
+  under the License.
+-->
+
+<tika-batch-config
+        maxAliveTimeSeconds="-1"
+        pauseOnEarlyTerminationMillis="500"
+        timeoutCheckPulseMillis="1000"
+        maxQueueSize="10000"
+        numConsumers="5"
+        timeoutThresholdMillis="300000">
+
+    <commandline>
+        <option opt="c" longOpt="tika-config" hasArg="true"
+                description="TikaConfig file"/>
+
+        <option opt="bc" longOpt="batch-config" hasArg="true"
+                description="xml batch config file" required="true"/>
+        <option opt="inputDir" hasArg="true"
+                description="dir to start crawling"/>
+        <option opt="numConsumers" hasArg="true"
+                description="number of fileConsumers threads"/>
+        <option opt="extractDir" hasArg="true"
+                description="this dir contains the files containing extracted 
metadata/content" required="false"/>
+        <option opt="db" hasArg="true"
+                description="name of db directory or file to which to write 
results"/>
+    </commandline>
+
+
+    <!--
+        Can also add startDir: this tells the crawler to start indexing a
+        child directory of the inputDir directory.
+    -->
+    <crawler builderClass="org.apache.tika.batch.fs.builders.FSCrawlerBuilder"
+             inputDir="src/test/resources/test-dirs/extractsA"
+             crawlOrder="sorted"
+             maxConsecWaitMillis="5000"
+             maxFilesToAdd="-1"
+             maxFilesToConsider="-1"
+             includeFilePat=""
+             excludeFilePat=""
+             maxFileSizeBytes="-1"
+            />
+
+    <consumers builderClass="org.apache.tika.eval.batch.EvalConsumersBuilder"
+               
consumerBuilderClass="org.apache.tika.eval.batch.SingleFileConsumerBuilder"
+               
errorLogFile="src/test/resources/test-dirs/batch-logs/batch-process-fatal.xml"
+               extractDir="src/test/resources/test-dirs/extractsA"
+               commonTokens="src/test/resources/common_tokens"/>
+
+
+    <!-- reporter and interrupter are optional -->
+    <reporter 
builderClass="org.apache.tika.batch.builders.SimpleLogReporterBuilder" 
sleepMillis="1000"
+              staleThresholdMillis="500000"/>
+    <interrupter 
builderClass="org.apache.tika.batch.builders.InterrupterBuilder"/>
+</tika-batch-config>
\ No newline at end of file

http://git-wip-us.apache.org/repos/asf/tika/blob/aa7a0c35/tika-eval/src/test/resources/single-file-profiler-crawl-input-config.xml
----------------------------------------------------------------------
diff --git 
a/tika-eval/src/test/resources/single-file-profiler-crawl-input-config.xml 
b/tika-eval/src/test/resources/single-file-profiler-crawl-input-config.xml
new file mode 100644
index 0000000..da59d03
--- /dev/null
+++ b/tika-eval/src/test/resources/single-file-profiler-crawl-input-config.xml
@@ -0,0 +1,73 @@
+<?xml version="1.0" encoding="UTF-8" standalone="no" ?>
+<!--
+  Licensed to the Apache Software Foundation (ASF) under one
+  or more contributor license agreements.  See the NOTICE file
+  distributed with this work for additional information
+  regarding copyright ownership.  The ASF licenses this file
+  to you under the Apache License, Version 2.0 (the
+  "License"); you may not use this file except in compliance
+  with the License.  You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+  Unless required by applicable law or agreed to in writing,
+  software distributed under the License is distributed on an
+  "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+  KIND, either express or implied.  See the License for the
+  specific language governing permissions and limitations
+  under the License.
+-->
+
+<tika-batch-config
+        maxAliveTimeSeconds="-1"
+        pauseOnEarlyTerminationMillis="500"
+        timeoutCheckPulseMillis="1000"
+        maxQueueSize="10000"
+        numConsumers="5"
+        timeoutThresholdMillis="300000">
+
+    <commandline>
+        <option opt="c" longOpt="tika-config" hasArg="true"
+                description="TikaConfig file"/>
+
+        <option opt="bc" longOpt="batch-config" hasArg="true"
+                description="xml batch config file" required="true"/>
+        <option opt="inputDir" hasArg="true"
+                description="dir to start crawling"/>
+        <option opt="numConsumers" hasArg="true"
+                description="number of fileConsumers threads"/>
+        <option opt="extractDir" hasArg="true"
+                description="this dir contains the files containing extracted 
metadata/content" required="false"/>
+        <option opt="db" hasArg="true"
+                description="name of db directory or file to which to write 
results"/>
+    </commandline>
+
+
+    <!--
+        Can also add startDir: this tells the crawler to start indexing a
+        child directory of the inputDir directory.
+    -->
+    <crawler builderClass="org.apache.tika.batch.fs.builders.FSCrawlerBuilder"
+             inputDir="src/test/resources/test-dirs/raw_input"
+             crawlOrder="sorted"
+             maxConsecWaitMillis="5000"
+             maxFilesToAdd="-1"
+             maxFilesToConsider="-1"
+             includeFilePat=""
+             excludeFilePat=""
+             maxFileSizeBytes="-1"
+            />
+
+    <consumers builderClass="org.apache.tika.eval.batch.EvalConsumersBuilder"
+               
consumerBuilderClass="org.apache.tika.eval.batch.SingleFileConsumerBuilder"
+               
errorLogFile="src/test/resources/test-dirs/batch-logs/batch-process-fatal.xml"
+               extractDir="src/test/resources/test-dirs/extractsA"
+               inputDir="src/test/resources/test-dirs/raw_input"
+               commonTokens="src/test/resources/common_tokens"/>
+
+
+    <!-- reporter and interrupter are optional -->
+    <reporter 
builderClass="org.apache.tika.batch.builders.SimpleLogReporterBuilder" 
sleepMillis="1000"
+              staleThresholdMillis="500000"/>
+    <interrupter 
builderClass="org.apache.tika.batch.builders.InterrupterBuilder"/>
+</tika-batch-config>
\ No newline at end of file

http://git-wip-us.apache.org/repos/asf/tika/blob/aa7a0c35/tika-eval/src/test/resources/test-dirs/batch-logs/batch-process-fatal.xml
----------------------------------------------------------------------
diff --git 
a/tika-eval/src/test/resources/test-dirs/batch-logs/batch-process-fatal.xml 
b/tika-eval/src/test/resources/test-dirs/batch-logs/batch-process-fatal.xml
new file mode 100644
index 0000000..520306b
--- /dev/null
+++ b/tika-eval/src/test/resources/test-dirs/batch-logs/batch-process-fatal.xml
@@ -0,0 +1,59 @@
+<!--
+  Licensed to the Apache Software Foundation (ASF) under one
+  or more contributor license agreements.  See the NOTICE file
+  distributed with this work for additional information
+  regarding copyright ownership.  The ASF licenses this file
+  to you under the Apache License, Version 2.0 (the
+  "License"); you may not use this file except in compliance
+  with the License.  You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+  Unless required by applicable law or agreed to in writing,
+  software distributed under the License is distributed on an
+  "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+  KIND, either express or implied.  See the License for the
+  specific language governing permissions and limitations
+  under the License.
+-->
+
+<log4j:event logger="org.apache.tika.batch.FileResourceConsumer" 
timestamp="1436376775762" level="ERROR" thread="pool-2-thread-11">
+<log4j:message><![CDATA[<?xml version="1.0" ?><timed_out 
resourceId="file10_permahang.txt" 
elapsedMS="340302"></timed_out>]]></log4j:message>
+</log4j:event>
+
+<log4j:event logger="org.apache.tika.batch.FileResourceConsumer" 
timestamp="1436376775758" level="ERROR" thread="pool-2-thread-10">
+<log4j:message><![CDATA[<?xml version="1.0" ?><oom 
resourceId="file11_oom.txt">java.lang.OutOfMemoryError: Java heap space
+       at 
java.io.ByteArrayOutputStream.&lt;init&gt;(ByteArrayOutputStream.java:77)
+       at 
org.apache.fontbox.ttf.MemoryTTFDataStream.&lt;init&gt;(MemoryTTFDataStream.java:45)
+       at org.apache.fontbox.ttf.TTFParser.parse(TTFParser.java:96)
+       at 
org.apache.pdfbox.pdmodel.font.PDTrueTypeFont.&lt;init&gt;(PDTrueTypeFont.java:135)
+       at 
org.apache.pdfbox.pdmodel.font.PDFontFactory.createFont(PDFontFactory.java:75)
+       at org.apache.pdfbox.pdmodel.PDResources.getFont(PDResources.java:96)
+       at 
org.apache.pdfbox.contentstream.operator.text.SetFontAndSize.process(SetFontAndSize.java:50)
+       at 
org.apache.pdfbox.contentstream.PDFStreamEngine.processOperator(PDFStreamEngine.java:795)
+       at 
org.apache.pdfbox.contentstream.PDFStreamEngine.processStreamOperators(PDFStreamEngine.java:462)
+       at 
org.apache.pdfbox.contentstream.PDFStreamEngine.processStream(PDFStreamEngine.java:438)
+       at 
org.apache.pdfbox.contentstream.PDFStreamEngine.showForm(PDFStreamEngine.java:178)
+       at 
org.apache.pdfbox.contentstream.operator.DrawObject.process(DrawObject.java:49)
+       at 
org.apache.pdfbox.contentstream.PDFStreamEngine.processOperator(PDFStreamEngine.java:795)
+       at 
org.apache.pdfbox.contentstream.PDFStreamEngine.processStreamOperators(PDFStreamEngine.java:462)
+       at 
org.apache.pdfbox.contentstream.PDFStreamEngine.processStream(PDFStreamEngine.java:438)
+       at 
org.apache.pdfbox.contentstream.PDFStreamEngine.processPage(PDFStreamEngine.java:149)
+       at 
org.apache.pdfbox.text.PDFTextStreamEngine.processPage(PDFTextStreamEngine.java:117)
+       at 
org.apache.pdfbox.text.PDFTextStripper.processPage(PDFTextStripper.java:369)
+       at 
org.apache.pdfbox.text.PDFTextStripper.processPages(PDFTextStripper.java:305)
+       at 
org.apache.pdfbox.text.PDFTextStripper.writeText(PDFTextStripper.java:249)
+       at org.apache.tika.parser.pdf.PDF2XHTML.process(PDF2XHTML.java:137)
+       at org.apache.tika.parser.pdf.PDFParser.parse(PDFParser.java:132)
+       at 
org.apache.tika.parser.CompositeParser.parse(CompositeParser.java:281)
+       at 
org.apache.tika.parser.CompositeParser.parse(CompositeParser.java:281)
+       at 
org.apache.tika.parser.AutoDetectParser.parse(AutoDetectParser.java:120)
+       at 
org.apache.tika.parser.ParserDecorator.parse(ParserDecorator.java:177)
+       at org.apache.tika.parser.DigestingParser.parse(DigestingParser.java:74)
+       at 
org.apache.tika.parser.RecursiveParserWrapper.parse(RecursiveParserWrapper.java:158)
+       at 
org.apache.tika.batch.FileResourceConsumer.parse(FileResourceConsumer.java:410)
+       at 
org.apache.tika.batch.fs.RecursiveParserWrapperFSConsumer.processFileResource(RecursiveParserWrapperFSConsumer.java:104)
+       at 
org.apache.tika.batch.FileResourceConsumer._processFileResource(FileResourceConsumer.java:182)
+       at 
org.apache.tika.batch.FileResourceConsumer.call(FileResourceConsumer.java:115)
+</oom>]]></log4j:message>
+</log4j:event>
\ No newline at end of file

Reply via email to