/*
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements.  See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License.  You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
package org.apache.tika.eval.io;

import java.io.BufferedReader;
import java.io.IOException;
import java.io.InputStream;
import java.io.InputStreamReader;
import java.io.Reader;
import java.nio.charset.StandardCharsets;
import java.nio.file.Files;
import java.nio.file.Path;
import java.util.ArrayList;
import java.util.List;
import java.util.regex.Matcher;
import java.util.regex.Pattern;
import java.util.zip.ZipInputStream;

import org.apache.commons.compress.compressors.bzip2.BZip2CompressorInputStream;
import org.apache.commons.compress.compressors.gzip.GzipCompressorInputStream;
import org.apache.tika.config.TikaConfig;
import org.apache.tika.exception.TikaException;
import org.apache.tika.io.IOUtils;
import org.apache.tika.metadata.Metadata;
import org.apache.tika.metadata.serialization.JsonMetadataList;
import org.apache.tika.mime.MediaType;
import org.apache.tika.parser.RecursiveParserWrapper;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

/**
 * Loads a tika-eval extract file (.json or .txt, optionally compressed with
 * bz2, gz/gzip or zip) into a list of {@link Metadata} objects.
 */
public class ExtractReader {

    /** How to alter the metadata list after loading a json extract. */
    public enum ALTER_METADATA_LIST {
        AS_IS,                          //leave the metadata list as is
        FIRST_ONLY,                     //take only the metadata list for the "container" document
        CONCATENATE_CONTENT_INTO_FIRST  //concatenate all of the content into the first
    }

    private static final Logger LOGGER = LoggerFactory.getLogger(ExtractReader.class);

    //compiled once (was compiled per parseSuffixes call):
    //group(1)=original file name, group(2)=txt|json, group(3)=optional compression suffix
    private static final Pattern SUFFIX_PATTERN =
            Pattern.compile("^(.*?)\\.(json|txt)(?:\\.(bz2|gz(?:ip)?|zip))?$");

    TikaConfig tikaConfig = TikaConfig.getDefaultConfig();

    /**
     * Loads the extract stored at {@code thisFile}.
     *
     * @param thisFile extract file; must end in .txt or .json, optionally
     *                 followed by .bz2/.gz/.gzip/.zip
     * @param alterExtractList how to post-process the metadata list of a json extract
     * @return the metadata list, or {@code null} if the file is missing, has an
     *         unrecognized suffix, or could not be parsed
     */
    public List<Metadata> loadExtract(Path thisFile, ALTER_METADATA_LIST alterExtractList) {
        if (thisFile == null || !Files.isRegularFile(thisFile)) {
            return null;
        }
        FileSuffixes fileSuffixes = parseSuffixes(thisFile.getFileName().toString());
        if (fileSuffixes.txtOrJson == null) {
            LOGGER.warn("file must end with .txt or .json: " + thisFile.getFileName().toString());
            return null;
        }

        List<Metadata> metadataList = null;
        InputStream is = null;
        Reader reader = null;
        try {
            is = Files.newInputStream(thisFile);
            if (fileSuffixes.compression != null) {
                if (fileSuffixes.compression.equals("bz2")) {
                    is = new BZip2CompressorInputStream(is);
                } else if (fileSuffixes.compression.equals("gz")
                        || fileSuffixes.compression.equals("gzip")) {
                    //the suffix pattern allows both "gz" and "gzip";
                    //only "gz" was handled before, so ".gzip" fell through to the warn branch
                    is = new GzipCompressorInputStream(is);
                } else if (fileSuffixes.compression.equals("zip")) {
                    //position at the first entry of the zip archive;
                    //ZCompressorInputStream (unix .Z) was the wrong codec for .zip
                    ZipInputStream zis = new ZipInputStream(is);
                    zis.getNextEntry();
                    is = zis;
                } else {
                    LOGGER.warn("Can't yet process compression of type: " + fileSuffixes.compression);
                }
            }
            reader = new BufferedReader(new InputStreamReader(is, StandardCharsets.UTF_8));

            if (fileSuffixes.txtOrJson.equals("json")) {
                metadataList = JsonMetadataList.fromJson(reader);
                //guard against a null list from fromJson before dereferencing
                if (metadataList != null && metadataList.size() > 1) {
                    if (alterExtractList == ALTER_METADATA_LIST.FIRST_ONLY) {
                        metadataList.subList(1, metadataList.size()).clear();
                    } else if (alterExtractList == ALTER_METADATA_LIST.CONCATENATE_CONTENT_INTO_FIRST) {
                        //was mistakenly referenced as AS_IS.CONCATENATE_CONTENT_INTO_FIRST
                        concatenateContentIntoFirst(metadataList);
                    }
                }
            } else {
                metadataList = generateListFromTextFile(reader, fileSuffixes);
            }
        } catch (IOException | TikaException e) {
            LOGGER.warn("couldn't open:" + thisFile.toAbsolutePath(), e);
        } finally {
            IOUtils.closeQuietly(reader);
            IOUtils.closeQuietly(is);
        }
        return metadataList;
    }

    //concatenates the content of every metadata object (container included)
    //into the first metadata object and drops the rest
    private static void concatenateContentIntoFirst(List<Metadata> metadataList) {
        StringBuilder sb = new StringBuilder();
        for (Metadata m : metadataList) {
            String c = m.get(RecursiveParserWrapper.TIKA_CONTENT);
            if (c != null) {
                sb.append(c);
                sb.append(" ");
            }
        }
        metadataList.get(0).set(RecursiveParserWrapper.TIKA_CONTENT, sb.toString());
        metadataList.subList(1, metadataList.size()).clear();
    }

    //wraps the full text of a .txt extract in a single Metadata object,
    //using the original file name's suffix to guess the mime type
    private List<Metadata> generateListFromTextFile(Reader reader,
                                                    FileSuffixes fileSuffixes) throws IOException {
        List<Metadata> metadataList = new ArrayList<>();
        String content = IOUtils.toString(reader);
        Metadata m = new Metadata();
        m.set(RecursiveParserWrapper.TIKA_CONTENT, content);
        //Let's hope the file name has a suffix that can
        //be used to determine the mime.  Could be wrong or missing,
        //but better than nothing.
        m.set(Metadata.RESOURCE_NAME_KEY, fileSuffixes.originalFileName);

        MediaType mimeType = tikaConfig.getMimeRepository().detect(null, m);
        if (mimeType != null) {
            m.set(Metadata.CONTENT_TYPE, mimeType.toString());
        }
        metadataList.add(m);
        return metadataList;
    }

    /**
     * Splits a file name into the original name, the txt/json suffix and an
     * optional compression suffix.  Fields are left {@code null} if the name
     * does not match the expected pattern.
     */
    protected static FileSuffixes parseSuffixes(String fName) {
        FileSuffixes fileSuffixes = new FileSuffixes();
        if (fName == null) {
            return fileSuffixes;
        }
        Matcher m = SUFFIX_PATTERN.matcher(fName);
        if (m.find()) {
            fileSuffixes.originalFileName = m.group(1);
            fileSuffixes.txtOrJson = m.group(2);
            fileSuffixes.compression = m.group(3);
        }
        return fileSuffixes;
    }

    private static class FileSuffixes {
        String compression;       //bz2|gz|gzip|zip or null
        String txtOrJson;         //"txt" or "json" or null
        String originalFileName;  //file name with the extract suffixes stripped
    }
}
/*
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements.  See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License.  You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

package org.apache.tika.eval.io;


import java.io.IOException;
import java.util.Map;

import org.apache.tika.eval.db.Cols;
import org.apache.tika.eval.db.TableInfo;

/**
 * Writer that records tika-eval rows into database tables.
 */
public interface IDBWriter {
    /**
     * Writes one row of column-name/value pairs to the given table.
     *
     * @param table target table
     * @param data column -&gt; string value for this row
     * @throws IOException if the write fails
     */
    public void writeRow(TableInfo table, Map<Cols, String> data) throws IOException;

    /**
     * Closes this writer and releases any underlying resources.
     *
     * @throws IOException if the close fails
     */
    public void close() throws IOException;

    /**
     * Returns an integer id for the given mime string
     * (presumably a key into a mime lookup table -- confirm against implementations).
     */
    public int getMimeId(String mimeString);
}
/*
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements.  See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License.  You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
package org.apache.tika.eval.io;

import java.io.IOException;
import java.sql.SQLException;

import org.apache.log4j.Level;

/**
 * Callback invoked by {@link XMLLogReader} for each &lt;message&gt; element
 * found in a log4j xml log file.
 */
public interface XMLLogMsgHandler {
    /**
     * Handles one logged message.
     *
     * @param level log level of the enclosing &lt;event&gt; element
     * @param xml text content of the &lt;message&gt; element
     * @throws IOException if the message cannot be processed
     * @throws SQLException if a database write triggered by the message fails
     */
    public void handleMsg(Level level, String xml) throws IOException, SQLException;
}
/*
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements.  See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License.  You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
package org.apache.tika.eval.io;

import javax.xml.stream.XMLInputFactory;
import javax.xml.stream.XMLStreamConstants;
import javax.xml.stream.XMLStreamException;
import javax.xml.stream.XMLStreamReader;
import java.io.ByteArrayInputStream;
import java.io.IOException;
import java.io.InputStream;
import java.sql.SQLException;

import org.apache.log4j.Level;
import org.apache.log4j.Logger;
import org.apache.tika.io.IOUtils;
import org.apache.tika.parser.ParseContext;

/**
 * Reads a log4j xml log file and hands each &lt;message&gt; element's text,
 * together with the level of its enclosing &lt;event&gt;, to an
 * {@link XMLLogMsgHandler}.  The raw log output is wrapped with an eventSet
 * header/footer so that it parses as a single well-formed xml document.
 */
public class XMLLogReader {

    private static final Logger LOGGER = Logger.getLogger(XMLLogReader.class);

    /**
     * Streams the log file and invokes {@code handler} once per message.
     *
     * @param xmlLogFileIs stream over the raw log4j xml output
     * @param handler callback for each message
     * @throws XMLStreamException if the wrapped log is not well-formed xml
     */
    public void read(InputStream xmlLogFileIs, XMLLogMsgHandler handler) throws XMLStreamException {
        InputStream is = new LogXMLWrappingInputStream(xmlLogFileIs);
        XMLInputFactory factory = new ParseContext().getXMLInputFactory();
        XMLStreamReader reader = factory.createXMLStreamReader(is);

        Level level = null;
        while (reader.hasNext()) {
            reader.next();
            switch (reader.getEventType()) {
                case XMLStreamConstants.START_ELEMENT:
                    if ("event".equals(reader.getLocalName())) {
                        level = Level.toLevel(reader.getAttributeValue("", "level"), Level.DEBUG);
                    } else if ("message".equals(reader.getLocalName())) {
                        //read the text exactly once: getElementText() consumes the
                        //element, so the old catch blocks, which called it a second
                        //time for the log message, would have thrown instead of logging
                        String msg = reader.getElementText();
                        try {
                            handler.handleMsg(level, msg);
                        } catch (IOException e) {
                            //keep the cause in the log instead of printStackTrace()
                            LOGGER.warn("Error parsing: " + msg, e);
                        } catch (SQLException e) {
                            LOGGER.warn("SQLException: " + e.getMessage(), e);
                        }
                    }
                    break;
                case XMLStreamConstants.END_ELEMENT:
                    if ("event".equals(reader.getLocalName())) {
                        //level only applies within its enclosing event
                        level = null;
                    }
                    break;
                default:
                    break;
            }
        }
    }

    /**
     * Wraps a logger's xml output into a single parseable xml document by
     * prepending an eventSet header and appending a footer.
     * (Adapted from log4j's chainsaw.)
     */
    private static class LogXMLWrappingInputStream extends InputStream {
        private static final String HEADER =
                "<?xml version=\"1.0\" encoding=\"UTF-8\" ?>"
                        + "<log4j:eventSet version=\"1.2\" "
                        + "xmlns:log4j=\"http://jakarta.apache.org/log4j/\">";
        private static final String FOOTER = "</log4j:eventSet>";

        private final InputStream[] streams;
        private int currentStreamIndex = 0;

        private LogXMLWrappingInputStream(InputStream xmlLogFileIs) {
            streams = new InputStream[]{
                    new ByteArrayInputStream(HEADER.getBytes(IOUtils.UTF_8)),
                    xmlLogFileIs,
                    new ByteArrayInputStream(FOOTER.getBytes(IOUtils.UTF_8))};
        }

        @Override
        public int read() throws IOException {
            int c = streams[currentStreamIndex].read();
            if (c < 0) {
                IOUtils.closeQuietly(streams[currentStreamIndex]);
                //advance to the next stream that still has bytes
                while (currentStreamIndex < streams.length - 1) {
                    currentStreamIndex++;
                    int next = streams[currentStreamIndex].read();
                    if (next < 0) {
                        IOUtils.closeQuietly(streams[currentStreamIndex]);
                    } else {
                        return next;
                    }
                }
                return -1;
            }
            return c;
        }
    }
}
/**
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements.  See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License.  You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
package org.apache.tika.eval.reports;

import java.io.IOException;
import java.io.OutputStream;
import java.nio.file.Files;
import java.nio.file.Path;
import java.sql.Connection;
import java.sql.ResultSet;
import java.sql.ResultSetMetaData;
import java.sql.SQLException;
import java.sql.Statement;
import java.sql.Types;
import java.util.HashMap;
import java.util.Map;

import org.apache.log4j.Logger;
import org.apache.poi.ss.usermodel.Cell;
import org.apache.poi.ss.usermodel.CellStyle;
import org.apache.poi.ss.usermodel.Row;
import org.apache.poi.ss.usermodel.VerticalAlignment;
import org.apache.poi.xssf.streaming.SXSSFSheet;
import org.apache.poi.xssf.streaming.SXSSFWorkbook;
import org.apache.poi.xssf.usermodel.XSSFWorkbook;

/**
 * This class represents a single report: a sql query whose result set is
 * written to one sheet of an xlsx file, optionally followed by a second
 * sheet containing the sql itself.
 */
public class Report {

    static final Logger logger = Logger.getLogger(Report.class);

    //string written in place of SQL NULL
    final String NULL_VALUE = "";//TODO: make this configurable!!!

    //column label -> formatter; columns without an entry fall back to type defaults
    Map<String, XSLXCellFormatter> cellFormatters = new HashMap<>();
    private XLSXNumFormatter defaultDoubleFormatter = new XLSXNumFormatter("0.000");
    private XLSXNumFormatter defaultIntegerFormatter = new XLSXNumFormatter("0");
    private CellStyle sqlCellStyle;

    String sql;
    String reportFilename;
    boolean includeSql = true;

    String reportName;

    /**
     * Runs this report's sql against {@code c} and writes the xlsx file
     * under {@code reportsRoot}.
     */
    public void writeReport(Connection c, Path reportsRoot) throws SQLException, IOException {
        logger.info("Writing report: " + reportName + " to " + reportFilename);
        dumpXLSX(c, reportsRoot);
    }

    private void dumpXLSX(Connection c, Path reportsRoot) throws IOException, SQLException {
        Path out = reportsRoot.resolve(reportFilename);
        Files.createDirectories(out.getParent());

        SXSSFWorkbook wb = new SXSSFWorkbook(new XSSFWorkbook(), 100, true, true);
        wb.setCompressTempFiles(true);
        defaultIntegerFormatter.reset(wb.getXSSFWorkbook());
        defaultDoubleFormatter.reset(wb.getXSSFWorkbook());
        sqlCellStyle = wb.createCellStyle();
        sqlCellStyle.setVerticalAlignment(VerticalAlignment.TOP);
        sqlCellStyle.setWrapText(true);

        //statement was never closed before
        try (Statement st = c.createStatement()) {
            dumpReportToWorkbook(st, wb);
        } finally {
            try (OutputStream os = Files.newOutputStream(out)) {
                wb.write(os);
            } finally {
                //deletes the temp files backing the streaming workbook
                wb.dispose();
            }
        }
    }

    private void dumpReportToWorkbook(Statement st, SXSSFWorkbook wb) throws IOException, SQLException {
        SXSSFSheet sheet = wb.createSheet("tika-eval Report");
        sheet.trackColumnForAutoSizing(0);

        //result set was never closed before
        try (ResultSet rs = st.executeQuery(sql)) {
            int rowCount = 0;
            ResultSetMetaData meta = rs.getMetaData();

            //header row of column labels
            Row xssfRow = sheet.createRow(rowCount++);
            for (int i = 1; i <= meta.getColumnCount(); i++) {
                xssfRow.createCell(i - 1).setCellValue(meta.getColumnLabel(i));
            }

            while (rs.next()) {
                xssfRow = sheet.createRow(rowCount++);
                for (int i = 1; i <= meta.getColumnCount(); i++) {
                    Cell cell = xssfRow.createCell(i - 1);
                    XSLXCellFormatter formatter = cellFormatters.get(meta.getColumnLabel(i));
                    if (formatter == null) {
                        formatter = getDefaultFormatter(meta.getColumnType(i));
                    }
                    if (formatter != null) {
                        formatter.applyStyleAndValue(i, rs, cell);
                    } else {
                        writeCell(meta, i, rs, cell);
                    }
                }
            }
        }
        sheet.autoSizeColumn(0);

        if (!includeSql) {
            return;
        }

        //second sheet containing the raw sql for this report
        SXSSFSheet sqlSheet = wb.createSheet("tika-eval SQL");
        sqlSheet.setColumnWidth(0, 100 * 250);
        Row sqlRow = sqlSheet.createRow(0);
        short height = 5000;
        sqlRow.setHeight(height);
        Cell cell = sqlRow.createCell(0);
        cell.setCellStyle(sqlCellStyle);

        cell.setCellValue(sql.trim());//.replaceAll("[\r\n]+", "\r\n"));
    }

    //maps a java.sql.Types constant to the default numeric formatter, or null
    //if the column should be written via writeCell
    private XSLXCellFormatter getDefaultFormatter(int columnType) {
        switch (columnType) {
            case Types.INTEGER:
                return defaultIntegerFormatter;
            case Types.DOUBLE:
            case Types.FLOAT:
            case Types.DECIMAL:
                return defaultDoubleFormatter;
            default:
                return null;
        }
    }

    //writes one unformatted cell; SQL NULL becomes NULL_VALUE
    private void writeCell(ResultSetMetaData meta, int colIndex, ResultSet rs,
                           Cell cell) throws SQLException {

        switch (meta.getColumnType(colIndex)) {
            //fall through on numerics
            case Types.BIGINT:
            case Types.SMALLINT:
            case Types.INTEGER:
            case Types.DOUBLE:
            case Types.FLOAT:
            case Types.DECIMAL:
            case Types.NUMERIC:
                double dbl = rs.getDouble(colIndex);
                if (rs.wasNull()) {
                    cell.setCellValue(NULL_VALUE);
                } else {
                    cell.setCellValue(dbl);
                }
                break;
            //fall through strings
            case Types.CHAR:
            case Types.VARCHAR:
            case Types.LONGNVARCHAR:
                String val = rs.getString(colIndex);
                if (rs.wasNull()) {
                    cell.setCellValue(NULL_VALUE);
                } else {
                    cell.setCellValue(val);
                }
                break;
            default:
                //must read the column BEFORE calling wasNull(); the old code
                //called wasNull() first, which reported the previous column's
                //nullness (JDBC: wasNull applies to the last column read)
                String unknown = rs.getString(colIndex);
                if (rs.wasNull()) {
                    cell.setCellValue(NULL_VALUE);
                } else {
                    cell.setCellValue(unknown);
                }
                logger.warn("Couldn't find type for: " + meta.getColumnType(colIndex) +
                        ". Defaulting to String");
        }
    }

}
/*
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements.  See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License.  You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
package org.apache.tika.eval.reports;


import javax.xml.parsers.DocumentBuilder;
import java.io.IOException;
import java.io.InputStream;
import java.nio.file.Files;
import java.nio.file.Path;
import java.nio.file.Paths;
import java.nio.file.StandardCopyOption;
import java.sql.Connection;
import java.sql.DatabaseMetaData;
import java.sql.ResultSet;
import java.sql.SQLException;
import java.sql.Statement;
import java.util.ArrayList;
import java.util.HashMap;
import java.util.List;
import java.util.Map;

import org.apache.commons.cli.CommandLine;
import org.apache.commons.cli.DefaultParser;
import org.apache.commons.cli.HelpFormatter;
import org.apache.commons.cli.Option;
import org.apache.commons.cli.Options;
import org.apache.commons.cli.ParseException;
import org.apache.poi.common.usermodel.Hyperlink;
import org.apache.tika.eval.ExtractComparer;
import org.apache.tika.eval.ExtractProfiler;
import org.apache.tika.eval.db.DBUtil;
import org.apache.tika.eval.db.H2Util;
import org.apache.tika.parser.ParseContext;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import org.w3c.dom.Document;
import org.w3c.dom.NamedNodeMap;
import org.w3c.dom.Node;
import org.w3c.dom.NodeList;

/**
 * Command-line tool that runs the sql reports defined in an xml config file
 * against a tika-eval database and writes each one to an xlsx file.
 */
public class ResultsReporter {

    protected static final Logger LOGGER = LoggerFactory.getLogger(ResultsReporter.class);

    private static final Options OPTIONS;

    static {
        OPTIONS = new Options();

        Option db = new Option("db", "database");
        db.setRequired(true);
        db.setArgs(1);

        OPTIONS.addOption(db)
                .addOption("rd", "reportsDir", true, "directory for the reports. " +
                        "If not specified, will write to 'reports'" +
                        "BEWARE: Will overwrite existing reports without warning!"
                )
                .addOption("rf", "reportsFile", true, "xml specifying sql to call for the reports." +
                        "If not specified, will use default reports in resources/tika-eval-*-config.xml");

    }

    /** Prints command-line usage. */
    public static void USAGE() {
        HelpFormatter helpFormatter = new HelpFormatter();
        helpFormatter.printHelp(
                80,
                "java -jar tika-eval-x.y.jar Report -db mydb [-rd myreports] [-rf myreports.xml]",
                "Tool: Report",
                ResultsReporter.OPTIONS,
                "Note: for h2 db, do not include the .mv.db at the end of the db name.");

    }

    //sql statements run before/after the reports, and the reports themselves,
    //in document order
    List<String> before = new ArrayList<>();
    List<String> after = new ArrayList<>();
    List<Report> reports = new ArrayList<>();

    private void addBefore(String b) {
        before.add(b);
    }

    private void addAfter(String a) {
        after.add(a);
    }

    private void addReport(Report r) {
        reports.add(r);
    }

    /**
     * Parses a reports xml config file into a ResultsReporter.
     *
     * @param p path to the xml config
     * @throws IllegalArgumentException if the config is structurally invalid
     */
    public static ResultsReporter build(Path p) throws Exception {

        ResultsReporter r = new ResultsReporter();

        DocumentBuilder docBuilder = new ParseContext().getDocumentBuilder();
        Document doc;
        try (InputStream is = Files.newInputStream(p)) {
            doc = docBuilder.parse(is);
        }
        Node docElement = doc.getDocumentElement();
        //was a bare assert, which is a no-op unless the JVM runs with -ea
        if (!"reports".equals(docElement.getNodeName())) {
            throw new IllegalArgumentException(
                    "root element must be <reports>, found: " + docElement.getNodeName());
        }
        NodeList children = docElement.getChildNodes();
        for (int i = 0; i < children.getLength(); i++) {
            Node n = children.item(i);
            if ("before".equals(n.getNodeName())) {
                for (String before : getSql(n)) {
                    r.addBefore(before);
                }
            } else if ("after".equals(n.getNodeName())) {
                for (String after : getSql(n)) {
                    r.addAfter(after);
                }
            } else if ("report".equals(n.getNodeName())) {
                r.addReport(buildReport(n));
            }
        }

        return r;
    }

    //builds one Report from a <report> element
    private static Report buildReport(Node n) {
        NodeList children = n.getChildNodes();
        Report r = new Report();
        NamedNodeMap attrs = n.getAttributes();

        //optional attribute; defaults to true when absent (was an unguarded NPE)
        Node includeSqlNode = attrs.getNamedItem("includeSql");
        if (includeSqlNode != null) {
            r.includeSql = Boolean.parseBoolean(includeSqlNode.getNodeValue());
        }
        r.reportFilename = attrs.getNamedItem("reportFilename").getNodeValue();
        r.reportName = attrs.getNamedItem("reportName").getNodeValue();

        for (int i = 0; i < children.getLength(); i++) {
            Node child = children.item(i);
            if (child.getNodeType() != Node.ELEMENT_NODE) { //was magic number 1
                continue;
            }
            if ("sql".equals(child.getNodeName())) {
                if (r.sql != null) {
                    throw new IllegalArgumentException("Can only have one sql statement per report");
                }
                r.sql = child.getTextContent();
            } else if ("colformats".equals(child.getNodeName())) {
                r.cellFormatters = getCellFormatters(child);
            } else {
                throw new IllegalArgumentException("Not expecting to see:" + child.getNodeName());
            }
        }
        return r;
    }

    //builds the column-name -> formatter map from a <colformats> element
    private static Map<String, XSLXCellFormatter> getCellFormatters(Node n) {
        NodeList children = n.getChildNodes();
        Map<String, XSLXCellFormatter> ret = new HashMap<>();
        for (int i = 0; i < children.getLength(); i++) {
            Node child = children.item(i);
            if (child.getNodeType() != Node.ELEMENT_NODE) {
                continue;
            }
            NamedNodeMap attrs = child.getAttributes();
            String columnName = attrs.getNamedItem("name").getNodeValue();
            //was a bare assert; fail loudly on a duplicate column format
            if (ret.containsKey(columnName)) {
                throw new IllegalArgumentException("duplicate column format for: " + columnName);
            }
            String type = attrs.getNamedItem("type").getNodeValue();
            if ("numberFormatter".equals(type)) {
                String format = attrs.getNamedItem("format").getNodeValue();
                ret.put(columnName, new XLSXNumFormatter(format));
            } else if ("urlLink".equals(type)) {
                ret.put(columnName, new XLSXHREFFormatter(getBase(attrs), Hyperlink.LINK_URL));
            } else if ("fileLink".equals(type)) {
                ret.put(columnName, new XLSXHREFFormatter(getBase(attrs), Hyperlink.LINK_FILE));
            }
        }
        return ret;
    }

    //returns the optional "base" attribute, or "" if absent
    private static String getBase(NamedNodeMap attrs) {
        Node baseNode = attrs.getNamedItem("base");
        return (baseNode == null) ? "" : baseNode.getNodeValue();
    }

    //collects the text content of each child element of n
    private static List<String> getSql(Node n) {
        List<String> ret = new ArrayList<>();

        NodeList children = n.getChildNodes();

        for (int i = 0; i < children.getLength(); i++) {
            Node child = children.item(i);
            if (child.getNodeType() != Node.ELEMENT_NODE) {
                continue;
            }
            ret.add(child.getTextContent());
        }
        return ret;
    }

    public static void main(String[] args) throws Exception {

        DefaultParser defaultCLIParser = new DefaultParser();
        CommandLine commandLine = null;
        try {
            commandLine = defaultCLIParser.parse(OPTIONS, args);
        } catch (ParseException e) {
            System.out.println(e.getMessage());
            USAGE();
            return;
        }
        Path db = Paths.get(commandLine.getOptionValue("db"));
        DBUtil dbUtil = new H2Util(db);

        try (Connection c = dbUtil.getConnection(true)) {
            Path tmpReportsFile = null;
            try {
                ResultsReporter resultsReporter = null;
                String reportsFile = commandLine.getOptionValue("rf");
                if (reportsFile == null) {
                    //no config given: pick the bundled config matching this db's schema
                    tmpReportsFile = getDefaultReportsConfig(c);
                    resultsReporter = ResultsReporter.build(tmpReportsFile);
                } else {
                    resultsReporter = ResultsReporter.build(Paths.get(reportsFile));
                }

                Path reportsRootDirectory = Paths.get(commandLine.getOptionValue("rd", "reports"));
                if (Files.isDirectory(reportsRootDirectory)) {
                    LOGGER.warn("'Reports' directory exists.  Will overwrite existing reports.");
                }

                resultsReporter.execute(c, reportsRootDirectory);
            } finally {
                if (tmpReportsFile != null) {
                    Files.deleteIfExists(tmpReportsFile);
                }
            }
        }
    }

    //copies the bundled reports config that matches this db's schema
    //(comparison vs. profile) to a temp file and returns its path
    private static Path getDefaultReportsConfig(Connection c) throws IOException, SQLException {
        DatabaseMetaData md = c.getMetaData();
        String internalPath = null;
        try (ResultSet rs = md.getTables(null, null, "%", null)) {
            while (rs.next()) {
                String tName = rs.getString(3);
                if (ExtractComparer.CONTENTS_TABLE_B.getName().equalsIgnoreCase(tName)) {
                    internalPath = "/comparison-reports.xml";
                    break;
                } else if (ExtractProfiler.PROFILE_TABLE.getName().equalsIgnoreCase(tName)) {
                    internalPath = "/profile-reports.xml";
                    break;
                }
            }
        }

        if (internalPath == null) {
            throw new RuntimeException("Couldn't determine if this database was a 'profiler' or 'comparison' db");
        }
        Path tmp = Files.createTempFile("tmp-tika-reports", ".xml");
        //resource stream was never closed before
        try (InputStream is = ResultsReporter.class.getResourceAsStream(internalPath)) {
            Files.copy(is, tmp, StandardCopyOption.REPLACE_EXISTING);
        }
        return tmp;
    }

    /**
     * Runs the before-statements, each report, then the after-statements.
     */
    public void execute(Connection c, Path reportsDirectory) throws IOException, SQLException {
        //statement was never closed before
        try (Statement st = c.createStatement()) {
            for (String sql : before) {
                st.execute(sql);
            }
            for (Report r : reports) {
                r.writeReport(c, reportsDirectory);
            }
            for (String sql : after) {
                st.execute(sql);
            }
        }
    }
}
/*
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements.  See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License.  You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
package org.apache.tika.eval.reports;

import java.nio.file.Paths;
import java.sql.ResultSet;
import java.sql.SQLException;

import org.apache.poi.ss.usermodel.Cell;
import org.apache.poi.ss.usermodel.CellStyle;
import org.apache.poi.ss.usermodel.Font;
import org.apache.poi.ss.usermodel.Hyperlink;
import org.apache.poi.ss.usermodel.IndexedColors;
import org.apache.poi.xssf.usermodel.XSSFWorkbook;

/**
 * Cell formatter that renders a string column as a clickable hyperlink
 * (url or file), showing only the path's file name as the cell text.
 * Stops adding links once the per-workbook limit is reached.
 */
public class XLSXHREFFormatter implements XSLXCellFormatter {
    //xlsx files can only have this many hyperlinks
    //if they have more Excel can't read the file
    private static final int MAX_HYPERLINKS = 65000;


    private final String urlBase;
    private final int linkType;
    private XSSFWorkbook workbook;
    private CellStyle style;
    private int links = 0;

    public XLSXHREFFormatter(String urlBase,
                             int linkType) {
        this.urlBase = urlBase;
        this.linkType = linkType;
    }

    @Override
    public void reset(XSSFWorkbook workbook) {
        this.workbook = workbook;
        //blue, underlined text to look like a conventional hyperlink
        Font linkFont = workbook.createFont();
        linkFont.setUnderline(Font.U_SINGLE);
        linkFont.setColor(IndexedColors.BLUE.getIndex());
        style = workbook.createCellStyle();
        style.setFont(linkFont);
        links = 0;
    }

    @Override
    public void applyStyleAndValue(int dbColNum, ResultSet resultSet, Cell cell) throws SQLException {
        if (links >= MAX_HYPERLINKS) {
            //silently stop adding hyperlinks
            return;
        }
        String relativePath = resultSet.getString(dbColNum);
        Hyperlink hyperlink = workbook.getCreationHelper().createHyperlink(linkType);
        hyperlink.setAddress(urlBase + relativePath);
        cell.setHyperlink(hyperlink);
        cell.setCellStyle(style);
        cell.setCellValue(Paths.get(relativePath).getFileName().toString());
        links++;
    }
}
See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.tika.eval.reports; + +import java.sql.ResultSet; +import java.sql.SQLException; + +import org.apache.poi.ss.usermodel.Cell; +import org.apache.poi.ss.usermodel.CellStyle; +import org.apache.poi.xssf.usermodel.XSSFWorkbook; + +class XLSXNumFormatter implements XSLXCellFormatter { + + private final String formatString; + private CellStyle style; + + XLSXNumFormatter(String formatString) { + this.formatString = formatString; + } + + + @Override + public void reset(XSSFWorkbook workbook) { + style = workbook.createCellStyle(); + style.setDataFormat(workbook.getCreationHelper() + .createDataFormat().getFormat(formatString)); + } + + @Override + public void applyStyleAndValue(int dbColNum, ResultSet resultSet, Cell cell) throws SQLException { + double d = resultSet.getDouble(dbColNum); + if (resultSet.wasNull()) { + + } else { + cell.setCellStyle(style); + } + cell.setCellValue(resultSet.getDouble(dbColNum)); + } +} http://git-wip-us.apache.org/repos/asf/tika/blob/aa7a0c35/tika-eval/src/main/java/org/apache/tika/eval/reports/XSLXCellFormatter.java ---------------------------------------------------------------------- diff --git a/tika-eval/src/main/java/org/apache/tika/eval/reports/XSLXCellFormatter.java 
b/tika-eval/src/main/java/org/apache/tika/eval/reports/XSLXCellFormatter.java new file mode 100644 index 0000000..eb0e024 --- /dev/null +++ b/tika-eval/src/main/java/org/apache/tika/eval/reports/XSLXCellFormatter.java @@ -0,0 +1,30 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ +package org.apache.tika.eval.reports; + + +import java.sql.ResultSet; +import java.sql.SQLException; + +import org.apache.poi.ss.usermodel.Cell; +import org.apache.poi.xssf.usermodel.XSSFWorkbook; + +interface XSLXCellFormatter { + + public void reset(XSSFWorkbook workbook); + public void applyStyleAndValue(int dbColNum, ResultSet resultSet, Cell cell) throws SQLException; +} \ No newline at end of file http://git-wip-us.apache.org/repos/asf/tika/blob/aa7a0c35/tika-eval/src/main/java/org/apache/tika/eval/tokens/AlphaIdeographFilterFactory.java ---------------------------------------------------------------------- diff --git a/tika-eval/src/main/java/org/apache/tika/eval/tokens/AlphaIdeographFilterFactory.java b/tika-eval/src/main/java/org/apache/tika/eval/tokens/AlphaIdeographFilterFactory.java new file mode 100644 index 0000000..fb72e84 --- /dev/null +++ b/tika-eval/src/main/java/org/apache/tika/eval/tokens/AlphaIdeographFilterFactory.java @@ -0,0 +1,74 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ +package org.apache.tika.eval.tokens; + +import java.io.IOException; +import java.util.Map; + +import org.apache.lucene.analysis.FilteringTokenFilter; +import org.apache.lucene.analysis.TokenStream; +import org.apache.lucene.analysis.tokenattributes.CharTermAttribute; +import org.apache.lucene.analysis.util.TokenFilterFactory; + +/** + * Factory for filter that only allows tokens with characters that "isAlphabetic" or "isIdeographic" through. + */ +public class AlphaIdeographFilterFactory extends TokenFilterFactory { + + + + public AlphaIdeographFilterFactory(Map<String, String> args) { + super(args); + } + + @Override + public TokenStream create(TokenStream tokenStream) { + return new AlphaFilter(tokenStream); + } + + /** + * Remove tokens tokens that do not contain an " + */ + private class AlphaFilter extends FilteringTokenFilter { + + private final CharTermAttribute termAtt = addAttribute(CharTermAttribute.class); + + public AlphaFilter(TokenStream in) { + super(in); + } + + @Override + protected boolean accept() throws IOException { + char[] buff = termAtt.buffer(); + for (int i = 0; i < termAtt.length(); i++) { + int cp = buff[i]; + if (Character.isHighSurrogate(buff[i])) { + if (i < termAtt.length()-1) { + cp = Character.toCodePoint(buff[i], buff[i + 1]); + i++; + } + } + + if (Character.isAlphabetic(cp) || + Character.isIdeographic(cp)) { + return true; + } + } + return false; + } + } +} http://git-wip-us.apache.org/repos/asf/tika/blob/aa7a0c35/tika-eval/src/main/java/org/apache/tika/eval/tokens/AnalyzerDeserializer.java ---------------------------------------------------------------------- diff --git a/tika-eval/src/main/java/org/apache/tika/eval/tokens/AnalyzerDeserializer.java b/tika-eval/src/main/java/org/apache/tika/eval/tokens/AnalyzerDeserializer.java new file mode 100644 index 0000000..cfc2d4f --- /dev/null +++ b/tika-eval/src/main/java/org/apache/tika/eval/tokens/AnalyzerDeserializer.java @@ -0,0 +1,345 @@ +/* + * Licensed to the Apache 
Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.apache.tika.eval.tokens; + + +import java.io.IOException; +import java.io.Reader; +import java.lang.reflect.Type; +import java.util.Collections; +import java.util.HashMap; +import java.util.LinkedList; +import java.util.List; +import java.util.Map; + +import com.google.gson.JsonArray; +import com.google.gson.JsonDeserializationContext; +import com.google.gson.JsonDeserializer; +import com.google.gson.JsonElement; +import com.google.gson.JsonObject; +import com.google.gson.JsonParseException; +import org.apache.lucene.analysis.Analyzer; +import org.apache.lucene.analysis.TokenStream; +import org.apache.lucene.analysis.Tokenizer; +import org.apache.lucene.analysis.util.CharFilterFactory; +import org.apache.lucene.analysis.util.ClasspathResourceLoader; +import org.apache.lucene.analysis.util.ResourceLoaderAware; +import org.apache.lucene.analysis.util.TokenFilterFactory; +import org.apache.lucene.analysis.util.TokenizerFactory; + +class AnalyzerDeserializer implements JsonDeserializer<Map<String, Analyzer>> { + + + private static String ANALYZERS = "analyzers"; + private static String CHAR_FILTERS = "charfilters"; + private static String TOKEN_FILTERS = "tokenfilters"; + private static 
String TOKENIZER = "tokenizer"; + private static String FACTORY = "factory"; + private static String PARAMS = "params"; + + @Override + public Map<String, Analyzer> deserialize(JsonElement element, Type type, + JsonDeserializationContext jsonDeserializationContext) throws JsonParseException { + if (! element.isJsonObject()) { + throw new IllegalArgumentException("Expecting top level 'analyzers:{}'"); + } + + JsonElement root = element.getAsJsonObject().get(ANALYZERS); + if (root == null) { + throw new IllegalArgumentException("Expecting top level 'analyzers:{}"); + } + try { + return buildAnalyzers(root); + } catch (IOException e) { + throw new RuntimeException(e); + } + + } + + public static Map<String, Analyzer> buildAnalyzers(JsonElement value) throws IOException { + if (! value.isJsonObject()) { + throw new IllegalArgumentException("Expecting map with analyzer names/analyzer definitions"); + } + Map<String, Analyzer> analyzers = new HashMap<>(); + JsonObject root = (JsonObject)value; + for (Map.Entry<String, JsonElement> e : root.entrySet()) { + String analyzerName = e.getKey(); + Analyzer analyzer = buildAnalyzer(analyzerName, e.getValue()); + analyzers.put(analyzerName, analyzer); + } + return analyzers; + } + + public static Analyzer buildAnalyzer(String analyzerName, JsonElement value) throws IOException { + if (! 
value.isJsonObject()) { + throw new IllegalArgumentException("Expecting map of charfilter, tokenizer, tokenfilters"); + } + JsonObject aRoot = (JsonObject)value; + CharFilterFactory[] charFilters = new CharFilterFactory[0]; + TokenizerFactory tokenizerFactory = null; + TokenFilterFactory[] tokenFilterFactories = new TokenFilterFactory[0]; + for ( Map.Entry<String, JsonElement> e : aRoot.entrySet()) { + String k = e.getKey(); + if (k.equals(CHAR_FILTERS)) { + charFilters = buildCharFilters(e.getValue(), analyzerName); + } else if (k.equals(TOKEN_FILTERS)) { + tokenFilterFactories = buildTokenFilterFactories(e.getValue(), analyzerName); + } else if (k.equals(TOKENIZER)) { + tokenizerFactory = buildTokenizerFactory(e.getValue(), analyzerName); + } else { + throw new IllegalArgumentException("Should have one of three values here:"+ + CHAR_FILTERS + ", "+ + TOKENIZER+", "+ + TOKEN_FILTERS + + ". I don't recognize: "+k); + } + } + if (tokenizerFactory == null) { + throw new IllegalArgumentException("Must specify at least a tokenizer factory for an analyzer!"); + } + return new MyTokenizerChain(charFilters, tokenizerFactory, tokenFilterFactories); + } + + private static TokenizerFactory buildTokenizerFactory(JsonElement map, String analyzerName) throws IOException { + if (!(map instanceof JsonObject)) { + throw new IllegalArgumentException("Expecting a map with \"factory\" string and " + + "\"params\" map in tokenizer factory;"+ + " not: "+map.toString() + " in "+analyzerName); + } + JsonElement factoryEl = ((JsonObject)map).get(FACTORY); + if (factoryEl == null || ! factoryEl.isJsonPrimitive()) { + throw new IllegalArgumentException("Expecting value for factory in char filter factory builder in:"+ + analyzerName); + } + String factoryName = factoryEl.getAsString(); + factoryName = factoryName.startsWith("oala.") ? 
+ factoryName.replaceFirst("oala.", "org.apache.lucene.analysis.") : factoryName; + + JsonElement paramsEl = ((JsonObject)map).get(PARAMS); + Map<String, String> params = mapify(paramsEl); + String spiName = ""; + for (String s : TokenizerFactory.availableTokenizers()) { + Class clazz = TokenizerFactory.lookupClass(s); + if (clazz.getName().equals(factoryName)) { + spiName = s; + break; + } + } + if (spiName.equals("")) { + throw new IllegalArgumentException("A SPI class of type org.apache.lucene.analysis.util.TokenizerFactory with name"+ + "'"+factoryName+"' does not exist."); + } + try { + TokenizerFactory tokenizerFactory = TokenizerFactory.forName(spiName, params); + if (tokenizerFactory instanceof ResourceLoaderAware) { + ((ResourceLoaderAware) tokenizerFactory).inform(new ClasspathResourceLoader(AnalyzerDeserializer.class)); + } + + return tokenizerFactory; + } catch (IllegalArgumentException e) { + throw new IllegalArgumentException("While working on "+analyzerName, e); + } + } + + private static CharFilterFactory[] buildCharFilters(JsonElement el, String analyzerName) throws IOException { + if (el == null || el.isJsonNull()) { + return null; + } + if (! el.isJsonArray()) { + throw new IllegalArgumentException("Expecting array for charfilters, but got:"+el.toString() + + " for "+analyzerName); + } + JsonArray jsonArray = (JsonArray)el; + List<CharFilterFactory> ret = new LinkedList<CharFilterFactory>(); + for (JsonElement filterMap : jsonArray) { + if (!(filterMap instanceof JsonObject)) { + throw new IllegalArgumentException("Expecting a map with \"factory\" string and \"params\" map in char filter factory;"+ + " not: "+filterMap.toString() + " in "+analyzerName); + } + JsonElement factoryEl = ((JsonObject)filterMap).get(FACTORY); + if (factoryEl == null || ! 
factoryEl.isJsonPrimitive()) { + throw new IllegalArgumentException( + "Expecting value for factory in char filter factory builder in:"+analyzerName); + } + String factoryName = factoryEl.getAsString(); + factoryName = factoryName.replaceAll("oala.", "org.apache.lucene.analysis."); + + JsonElement paramsEl = ((JsonObject)filterMap).get(PARAMS); + Map<String, String> params = mapify(paramsEl); + String spiName = ""; + for (String s : CharFilterFactory.availableCharFilters()) { + Class clazz = CharFilterFactory.lookupClass(s); + if (clazz.getName().equals(factoryName)) { + spiName = s; + break; + } + } + if (spiName.equals("")) { + throw new IllegalArgumentException("A SPI class of type org.apache.lucene.analysis.util.CharFilterFactory with name"+ + "'"+factoryName+"' does not exist."); + } + + try { + CharFilterFactory charFilterFactory = CharFilterFactory.forName(spiName, params); + if (charFilterFactory instanceof ResourceLoaderAware) { + ((ResourceLoaderAware) charFilterFactory).inform(new ClasspathResourceLoader(AnalyzerDeserializer.class)); + } + ret.add(charFilterFactory); + } catch (IllegalArgumentException e) { + throw new IllegalArgumentException("While trying to load "+ + analyzerName + ": "+ e.getMessage(), e); + } + } + if (ret.size() == 0) { + return new CharFilterFactory[0]; + } + return ret.toArray(new CharFilterFactory[ret.size()]); + } + + private static TokenFilterFactory[] buildTokenFilterFactories(JsonElement el, + String analyzerName) throws IOException { + if (el == null || el.isJsonNull()) { + return null; + } + if (! 
el.isJsonArray()) { + throw new IllegalArgumentException( + "Expecting array for tokenfilters, but got:"+el.toString() + " in "+analyzerName); + } + JsonArray jsonArray = (JsonArray)el; + List<TokenFilterFactory> ret = new LinkedList<>(); + for (JsonElement filterMap : jsonArray) { + if (!(filterMap instanceof JsonObject)) { + throw new IllegalArgumentException("Expecting a map with \"factory\" string and \"params\" map in token filter factory;"+ + " not: "+filterMap.toString() + " in "+ analyzerName); + } + JsonElement factoryEl = ((JsonObject)filterMap).get(FACTORY); + if (factoryEl == null || ! factoryEl.isJsonPrimitive()) { + throw new IllegalArgumentException("Expecting value for factory in token filter factory builder in "+analyzerName); + } + String factoryName = factoryEl.getAsString(); + factoryName = factoryName.startsWith("oala.") ? + factoryName.replaceFirst("oala.", "org.apache.lucene.analysis.") : + factoryName; + + JsonElement paramsEl = ((JsonObject)filterMap).get(PARAMS); + Map<String, String> params = mapify(paramsEl); + String spiName = ""; + for (String s : TokenFilterFactory.availableTokenFilters()) { + Class clazz = TokenFilterFactory.lookupClass(s); + if (clazz.getName().equals(factoryName)) { + spiName = s; + break; + } + } + if (spiName.equals("")) { + throw new IllegalArgumentException("A SPI class of type org.apache.lucene.analysis.util.TokenFilterFactory with name"+ + "'"+factoryName+"' does not exist."); + } + + try { + TokenFilterFactory tokenFilterFactory = TokenFilterFactory.forName(spiName, params); + if (tokenFilterFactory instanceof ResourceLoaderAware) { + ((ResourceLoaderAware) tokenFilterFactory).inform(new ClasspathResourceLoader(AnalyzerDeserializer.class)); + } + ret.add(tokenFilterFactory); + } catch (IllegalArgumentException e) { + throw new IllegalArgumentException("While loading "+analyzerName, e); + } + } + if (ret.size() == 0) { + return new TokenFilterFactory[0]; + } + return ret.toArray(new 
TokenFilterFactory[ret.size()]); + } + + private static Map<String, String> mapify(JsonElement paramsEl) { + if (paramsEl == null || paramsEl.isJsonNull()) { + return Collections.EMPTY_MAP; + } + if (! paramsEl.isJsonObject()) { + throw new IllegalArgumentException("Expecting map, not: "+paramsEl.toString()); + } + Map<String, String> params = new HashMap<>(); + for (Map.Entry<String,JsonElement> e : ((JsonObject)paramsEl).entrySet()) { + JsonElement value = e.getValue(); + if (! value.isJsonPrimitive()) { + throw new IllegalArgumentException("Expecting parameter to have primitive value: "+value.toString()); + } + String v = e.getValue().getAsString(); + params.put(e.getKey(), v); + } + return params; + } + + /** + * Plagiarized verbatim from Solr! + */ + private static class MyTokenizerChain extends Analyzer { + + final private CharFilterFactory[] charFilters; + final private TokenizerFactory tokenizer; + final private TokenFilterFactory[] filters; + + public MyTokenizerChain(TokenizerFactory tokenizer, TokenFilterFactory[] filters) { + this(null, tokenizer, filters); + } + + public MyTokenizerChain(CharFilterFactory[] charFilters, TokenizerFactory tokenizer, TokenFilterFactory[] filters) { + this.charFilters = charFilters; + this.tokenizer = tokenizer; + this.filters = filters; + } + + public CharFilterFactory[] getCharFilterFactories() { + return charFilters; + } + + public TokenizerFactory getTokenizerFactory() { + return tokenizer; + } + + public TokenFilterFactory[] getTokenFilterFactories() { + return filters; + } + + @Override + public Reader initReader(String fieldName, Reader reader) { + + if (charFilters != null && charFilters.length > 0) { + Reader cs = reader; + for (CharFilterFactory charFilter : charFilters) { + cs = charFilter.create(cs); + } + reader = cs; + } + + return reader; + } + + @Override + protected TokenStreamComponents createComponents(String fieldName) { + Tokenizer tk = tokenizer.create(); + TokenStream ts = tk; + for 
(TokenFilterFactory filter : filters) { + ts = filter.create(ts); + } + + return new TokenStreamComponents(tk, ts); + } + } + +} http://git-wip-us.apache.org/repos/asf/tika/blob/aa7a0c35/tika-eval/src/main/java/org/apache/tika/eval/tokens/AnalyzerManager.java ---------------------------------------------------------------------- diff --git a/tika-eval/src/main/java/org/apache/tika/eval/tokens/AnalyzerManager.java b/tika-eval/src/main/java/org/apache/tika/eval/tokens/AnalyzerManager.java new file mode 100644 index 0000000..db6ae26 --- /dev/null +++ b/tika-eval/src/main/java/org/apache/tika/eval/tokens/AnalyzerManager.java @@ -0,0 +1,95 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ +package org.apache.tika.eval.tokens; + +import java.io.IOException; +import java.io.InputStream; +import java.io.InputStreamReader; +import java.io.Reader; +import java.util.Map; + +import com.google.gson.Gson; +import com.google.gson.GsonBuilder; +import com.google.gson.JsonParseException; +import org.apache.lucene.analysis.Analyzer; + +public class AnalyzerManager { + + private final static String GENERAL = "general"; + private static final String ALPHA_IDEOGRAPH = "alpha"; + private static final String COMMON_TOKENS = "common_tokens"; + + private final Analyzer generalAnalyzer; + private final Analyzer alphaIdeoAnalyzer; + private final Analyzer commonTokensAnalyzer; + + private AnalyzerManager(Analyzer generalAnalyzer, + Analyzer alphaIdeoAnalyzer, + Analyzer commonTokensAnalyzer) { + this.generalAnalyzer = generalAnalyzer; + this.alphaIdeoAnalyzer = alphaIdeoAnalyzer; + this.commonTokensAnalyzer = commonTokensAnalyzer; + } + + public static AnalyzerManager newInstance() throws IOException { + InputStream is = AnalyzerManager.class.getClassLoader().getResourceAsStream("lucene-analyzers.json"); + Reader reader = new InputStreamReader(is, "UTF-8"); + GsonBuilder builder = new GsonBuilder(); + builder.registerTypeHierarchyAdapter(Map.class, new AnalyzerDeserializer()); + Gson gson = builder.create(); + Map<String, Analyzer> map = gson.fromJson(reader, Map.class); + Analyzer general = map.get(GENERAL); + Analyzer alphaIdeo = map.get(ALPHA_IDEOGRAPH); + Analyzer common = map.get(COMMON_TOKENS); + if (general == null) { + throw new JsonParseException("Must specify "+GENERAL + " analyzer"); + } + if (alphaIdeo == null) { + throw new JsonParseException("Must specify "+ ALPHA_IDEOGRAPH + " analyzer"); + } + if (common == null) { + throw new JsonParseException("Must specify "+ COMMON_TOKENS + " analyzer"); + } + + return new AnalyzerManager(general, alphaIdeo, common); + } + + /** + * This analyzer should be used to extract all tokens. 
+ * @return + */ + public Analyzer getGeneralAnalyzer() { + return generalAnalyzer; + } + + /** + * This analyzer is used to extract "alphabetic" tokens. + * @return + */ + public Analyzer getAlphaIdeoAnalyzer() { + return alphaIdeoAnalyzer; + } + + /** + * This analyzer should be used to generate common tokens lists from + * large corpora. It is not used by tika-eval in profiling or comparing. + * @return + */ + public Analyzer getCommonTokensAnalyzer() { + return commonTokensAnalyzer; + } +} http://git-wip-us.apache.org/repos/asf/tika/blob/aa7a0c35/tika-eval/src/main/java/org/apache/tika/eval/tokens/CJKBigramAwareLengthFilterFactory.java ---------------------------------------------------------------------- diff --git a/tika-eval/src/main/java/org/apache/tika/eval/tokens/CJKBigramAwareLengthFilterFactory.java b/tika-eval/src/main/java/org/apache/tika/eval/tokens/CJKBigramAwareLengthFilterFactory.java new file mode 100644 index 0000000..31fa866 --- /dev/null +++ b/tika-eval/src/main/java/org/apache/tika/eval/tokens/CJKBigramAwareLengthFilterFactory.java @@ -0,0 +1,74 @@ +package org.apache.tika.eval.tokens; + +import java.io.IOException; +import java.util.Map; + +import org.apache.lucene.analysis.FilteringTokenFilter; +import org.apache.lucene.analysis.TokenStream; +import org.apache.lucene.analysis.cjk.CJKBigramFilter; +import org.apache.lucene.analysis.tokenattributes.CharTermAttribute; +import org.apache.lucene.analysis.tokenattributes.TypeAttribute; +import org.apache.lucene.analysis.util.TokenFilterFactory; + +/** + * Creates a very narrowly focused TokenFilter that limits tokens based on length + * _unless_ they've been identified as <DOUBLE> or <SINGLE> + * by the CJKBigramFilter. + * + * This class is intended to be used when generating "common tokens" files. 
+ */ +public class CJKBigramAwareLengthFilterFactory extends TokenFilterFactory { + + + + private final int min; + private final int max; + public CJKBigramAwareLengthFilterFactory(Map<String, String> args) { + super(args); + min = Integer.parseInt(args.get("min")); + max = Integer.parseInt(args.get("max")); + } + + @Override + public TokenStream create(TokenStream tokenStream) { + return new CJKAwareLengthFilter(tokenStream); + } + + private class CJKAwareLengthFilter extends FilteringTokenFilter { + private final CharTermAttribute termAtt = addAttribute(CharTermAttribute.class); + private final TypeAttribute typeAtt = addAttribute(TypeAttribute.class); + + public CJKAwareLengthFilter(TokenStream in) { + super(in); + } + + @Override + protected boolean accept() throws IOException { + if ( termAtt.length() < min) { + String type = typeAtt.type(); + if (type == CJKBigramFilter.DOUBLE_TYPE || type == CJKBigramFilter.SINGLE_TYPE) { + return true; + } + } + return termAtt.length() >= min && termAtt.length() <= max; + } + } + + /* + private static boolean isCJ(int codePoint) { + if ( + (codePoint >= 0x4E00 && codePoint <= 0x9FFF) || + ( codePoint >= 0x3400 && codePoint <= 0x4dbf) || + ( codePoint >= 0x20000 && codePoint <= 0x2a6df) || + ( codePoint >= 0x2A700 && codePoint <= 0x2b73f) || + ( codePoint >= 0x2B740 && codePoint <= 0x2B81F) || + ( codePoint >= 0x2B820 && codePoint <- 0x2CEAF) || + ( codePoint >= 0xF900 && codePoint <= 0xFAFF) || + ( codePoint >= 0x2F800 && codePoint <= 0x2Fa1F) + ) { + return true; + } + return false; + }*/ + +} http://git-wip-us.apache.org/repos/asf/tika/blob/aa7a0c35/tika-eval/src/main/java/org/apache/tika/eval/tokens/CommonTokenCountManager.java ---------------------------------------------------------------------- diff --git a/tika-eval/src/main/java/org/apache/tika/eval/tokens/CommonTokenCountManager.java b/tika-eval/src/main/java/org/apache/tika/eval/tokens/CommonTokenCountManager.java new file mode 100644 index 0000000..b74daa1 --- 
/dev/null +++ b/tika-eval/src/main/java/org/apache/tika/eval/tokens/CommonTokenCountManager.java @@ -0,0 +1,141 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.tika.eval.tokens; + +import java.io.BufferedReader; +import java.io.IOException; +import java.nio.charset.Charset; +import java.nio.charset.StandardCharsets; +import java.nio.file.Files; +import java.nio.file.Path; +import java.util.HashSet; +import java.util.Map; +import java.util.Set; +import java.util.concurrent.ConcurrentHashMap; + +import org.apache.commons.lang3.mutable.MutableInt; +import org.slf4j.Logger; +import org.slf4j.LoggerFactory; + +public class CommonTokenCountManager { + + private final static Charset COMMON_TOKENS_CHARSET = StandardCharsets.UTF_8; + private final static Logger LOGGER = LoggerFactory.getLogger(CommonTokenCountManager.class); + + private final Path commonTokensDir; + + Map<String, Set<String>> commonTokenMap = new ConcurrentHashMap<>(); + Set<String> alreadyTriedToLoad = new HashSet<>(); + + //if we have no model or if no langid is passed in + //make this configurable + String defaultLangCode = "en"; + + public CommonTokenCountManager(Path commonTokensDir) throws IOException { + this.commonTokensDir = 
commonTokensDir; + tryToLoad(defaultLangCode); + //if you couldn't load it, make sure to add an empty + //set to prevent npes later + Set<String> set = commonTokenMap.get(defaultLangCode); + if (set == null) { + LOGGER.warn("No common tokens for default language: '"+defaultLangCode+"'"); + commonTokenMap.put(defaultLangCode, new HashSet<String>()); + } + } + + public CommonTokenResult countTokenOverlaps(String langCode, + Map<String, MutableInt> tokens) throws IOException { + String actualLangCode = getActualLangCode(langCode); + int overlap = 0; + Set<String> commonTokens = commonTokenMap.get(actualLangCode); + for (Map.Entry<String, MutableInt> e : tokens.entrySet()) { + if (commonTokens.contains(e.getKey())) { + overlap += e.getValue().intValue(); + } + } + return new CommonTokenResult(actualLangCode, overlap); + } + + //return langcode for lang that you are actually using + //lazily load the appropriate model + private String getActualLangCode(String langCode) { + if (langCode == null || "".equals(langCode)) { + return defaultLangCode; + } + if (commonTokenMap.containsKey(langCode)) { + return langCode; + } + tryToLoad(langCode); + Set<String> set = commonTokenMap.get(langCode); + if (set == null) { + return defaultLangCode; + } + return langCode; + + } + + public void close() throws IOException { + commonTokenMap.clear(); + } + + private synchronized void tryToLoad(String langCode) { + if (alreadyTriedToLoad.contains(langCode)) { + return; + } + //check once more now that we're in a + //synchronized block + if (commonTokenMap.get(langCode) != null) { + return; + } + Path p = commonTokensDir.resolve(langCode); + if (!Files.isRegularFile(p)) { + LOGGER.warn("Couldn't find common tokens file for: '"+langCode+"': "+ + p.toAbsolutePath()); + alreadyTriedToLoad.add(langCode); + return; + } + + Set<String> set = commonTokenMap.get(langCode); + if (set == null) { + set = new HashSet<>(); + commonTokenMap.put(langCode, set); + } + try (BufferedReader reader = 
Files.newBufferedReader(p, COMMON_TOKENS_CHARSET)) { + alreadyTriedToLoad.add(langCode); + String line = reader.readLine(); + while (line != null) { + line = line.trim(); + if (line.startsWith("#")) { + line = reader.readLine(); + continue; + } + //allow language models with, e.g. tab-delimited counts after the term + String[] cols = line.split("\t"); + String t = cols[0].trim(); + if (t.length() > 0) { + set.add(t); + } + + line = reader.readLine(); + } + } catch (IOException e) { + LOGGER.warn("IOException trying to read: '"+langCode+"'"); + } + } + +} http://git-wip-us.apache.org/repos/asf/tika/blob/aa7a0c35/tika-eval/src/main/java/org/apache/tika/eval/tokens/CommonTokenResult.java ---------------------------------------------------------------------- diff --git a/tika-eval/src/main/java/org/apache/tika/eval/tokens/CommonTokenResult.java b/tika-eval/src/main/java/org/apache/tika/eval/tokens/CommonTokenResult.java new file mode 100644 index 0000000..340ceca --- /dev/null +++ b/tika-eval/src/main/java/org/apache/tika/eval/tokens/CommonTokenResult.java @@ -0,0 +1,37 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
/**
 * Immutable result of comparing a document's tokens against a language's
 * common-tokens list: the language code of the list that was actually used
 * and how many token occurrences matched it.
 */
public class CommonTokenResult {

    private final String lang;
    private final int overlapCount;

    public CommonTokenResult(String langCode, int tokens) {
        this.lang = langCode;
        this.overlapCount = tokens;
    }

    /** @return language code of the common-tokens list that was used */
    public String getLangCode() {
        return lang;
    }

    /** @return number of token occurrences found in the common-tokens list */
    public int getTokens() {
        return overlapCount;
    }
}
+ */ + +package org.apache.tika.eval.tokens; + +public class ContrastStatistics { + + + double diceCoefficient; + double overlap; + + TokenIntPair[] topNUniqueA; + TokenIntPair[] topNUniqueB; + TokenIntPair[] topNMoreA; + TokenIntPair[] topNMoreB; + + void setDiceCoefficient(double diceCoefficient) { + this.diceCoefficient = diceCoefficient; + } + + void setOverlap(double overlap) { + this.overlap = overlap; + } + + void setTopNUniqueA(TokenIntPair[] topNUniqueA) { + this.topNUniqueA = topNUniqueA; + } + + void setTopNUniqueB(TokenIntPair[] topNUniqueB) { + this.topNUniqueB = topNUniqueB; + } + + void setTopNMoreA(TokenIntPair[] topNMoreA) { + this.topNMoreA = topNMoreA; + } + + void setTopNMoreB(TokenIntPair[] topNMoreB) { + this.topNMoreB = topNMoreB; + } + + public double getDiceCoefficient() { + return diceCoefficient; + } + + public double getOverlap() { + return overlap; + } + + public TokenIntPair[] getTopNUniqueA() { + return topNUniqueA; + } + + public TokenIntPair[] getTopNUniqueB() { + return topNUniqueB; + } + + public TokenIntPair[] getTopNMoreA() { + return topNMoreA; + } + + public TokenIntPair[] getTopNMoreB() { + return topNMoreB; + } +} http://git-wip-us.apache.org/repos/asf/tika/blob/aa7a0c35/tika-eval/src/main/java/org/apache/tika/eval/tokens/TokenContraster.java ---------------------------------------------------------------------- diff --git a/tika-eval/src/main/java/org/apache/tika/eval/tokens/TokenContraster.java b/tika-eval/src/main/java/org/apache/tika/eval/tokens/TokenContraster.java new file mode 100644 index 0000000..6cbbbc7 --- /dev/null +++ b/tika-eval/src/main/java/org/apache/tika/eval/tokens/TokenContraster.java @@ -0,0 +1,183 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. 
+ * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.tika.eval.tokens; + +import java.util.Map; + +import org.apache.commons.lang3.mutable.MutableInt; +import org.apache.lucene.util.PriorityQueue; + +/** + * Computes some corpus contrast statistics. + * + * Not thread safe. + */ +public class TokenContraster { + + private TokenStatistics tokenStatisticsA; + private TokenStatistics tokenStatisticsB; + + private TokenCountPriorityQueue uniqA; + private TokenCountPriorityQueue uniqB; + + private TokenCountDiffQueue moreA; + private TokenCountDiffQueue moreB; + + + private int topN = 10; + + private int diceCoefficientNum = 0; + private int overlapNum = 0; + + private double diceCoefficient = 0.0d; + private double overlap = 0.0; + + + public ContrastStatistics calculateContrastStatistics(Map<String, MutableInt> mapA, + TokenStatistics tokenStatisticsA, + Map<String, MutableInt> mapB, + TokenStatistics tokenStatisticsB) { + reset(); + this.tokenStatisticsA = tokenStatisticsA; + this.tokenStatisticsB = tokenStatisticsB; + + for (Map.Entry<String, MutableInt> e : mapA.entrySet()) { + MutableInt bVal = mapB.get(e.getKey()); + int b = (bVal == null) ? 
0 : bVal.intValue(); + add(e.getKey(), e.getValue().intValue(), b); + } + + for (Map.Entry<String, MutableInt> e : mapB.entrySet()) { + if (mapA.containsKey(e.getKey())) { + continue; + } + add(e.getKey(), 0, e.getValue().intValue()); + } + finishComputing(); + ContrastStatistics contrastStatistics = new ContrastStatistics(); + contrastStatistics.setDiceCoefficient(diceCoefficient); + contrastStatistics.setOverlap(overlap); + contrastStatistics.setTopNUniqueA(uniqA.getArray()); + contrastStatistics.setTopNUniqueB(uniqB.getArray()); + contrastStatistics.setTopNMoreA(moreA.getArray()); + contrastStatistics.setTopNMoreB(moreB.getArray()); + return contrastStatistics; + } + + private void reset() { + this.uniqA = new TokenCountPriorityQueue(topN); + this.uniqB = new TokenCountPriorityQueue(topN); + this.moreA = new TokenCountDiffQueue(topN); + this.moreB = new TokenCountDiffQueue(topN); + diceCoefficientNum = 0; + overlapNum = 0; + diceCoefficient = 0.0d; + overlap = 0.0; + + } + private void add(String token, int tokenCountA, int tokenCountB) { + if (tokenCountA > 0 && tokenCountB > 0) { + diceCoefficientNum += 2; + overlapNum += 2 * Math.min(tokenCountA, tokenCountB); + } + + + if (tokenCountA == 0L && tokenCountB > 0L) { + addToken(token, tokenCountB, uniqB); + } + if (tokenCountB == 0L && tokenCountA > 0L) { + addToken(token, tokenCountA, uniqA); + } + + if (tokenCountA > tokenCountB) { + addTokenDiff(token, tokenCountA, tokenCountA-tokenCountB, moreA); + } else if (tokenCountB > tokenCountA) { + addTokenDiff(token, tokenCountB, tokenCountB-tokenCountA, moreB); + + } + + } + + private void finishComputing() { + + long sumUniqTokens = tokenStatisticsA.getTotalUniqueTokens() + +tokenStatisticsB.getTotalUniqueTokens(); + + diceCoefficient = (double) diceCoefficientNum / (double) sumUniqTokens; + overlap = (float) overlapNum / (double) (tokenStatisticsA.getTotalTokens() + + tokenStatisticsB.getTotalTokens()); + + } + + private void addTokenDiff(String token, int 
tokenCount, int diff, TokenCountDiffQueue queue) { + if (queue.top() == null || queue.size() < topN || + diff >= queue.top().diff) { + queue.insertWithOverflow(new TokenCountDiff(token, diff, tokenCount)); + } + + } + + private void addToken(String token, int tokenCount, TokenCountPriorityQueue queue) { + if (queue.top() == null || queue.size() < topN || + tokenCount >= queue.top().getValue()) { + queue.insertWithOverflow(new TokenIntPair(token, tokenCount)); + } + + } + + class TokenCountDiffQueue extends PriorityQueue<TokenCountDiff> { + + TokenCountDiffQueue(int maxSize) { + super(maxSize); + } + + @Override + protected boolean lessThan(TokenCountDiff arg0, TokenCountDiff arg1) { + if (arg0.diff < arg1.diff) { + return true; + } else if (arg0.diff > arg1.diff) { + return false; + } + return arg1.token.compareTo(arg0.token) < 0; + } + + public TokenIntPair[] getArray() { + TokenIntPair[] topN = new TokenIntPair[size()]; + //now we reverse the queue + TokenCountDiff token = pop(); + int i = topN.length-1; + while (token != null && i > -1) { + topN[i--] = new TokenIntPair(token.token, token.diff); + token = pop(); + } + return topN; + } + } + + private class TokenCountDiff { + private final String token; + private final int diff; + private final int count; + + private TokenCountDiff(String token, int diff, int count) { + this.token = token; + this.diff = diff; + this.count = count; + } + } +} http://git-wip-us.apache.org/repos/asf/tika/blob/aa7a0c35/tika-eval/src/main/java/org/apache/tika/eval/tokens/TokenCountPriorityQueue.java ---------------------------------------------------------------------- diff --git a/tika-eval/src/main/java/org/apache/tika/eval/tokens/TokenCountPriorityQueue.java b/tika-eval/src/main/java/org/apache/tika/eval/tokens/TokenCountPriorityQueue.java new file mode 100644 index 0000000..624da0c --- /dev/null +++ b/tika-eval/src/main/java/org/apache/tika/eval/tokens/TokenCountPriorityQueue.java @@ -0,0 +1,49 @@ +/* + * Licensed to the Apache 
Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.tika.eval.tokens; + +import org.apache.lucene.util.PriorityQueue; + +public class TokenCountPriorityQueue extends PriorityQueue<TokenIntPair> { + + TokenCountPriorityQueue(int maxSize) { + super(maxSize); + } + + @Override + protected boolean lessThan(TokenIntPair arg0, TokenIntPair arg1) { + if (arg0.getValue() < arg1.getValue()) { + return true; + } else if (arg0.getValue() > arg1.getValue()) { + return false; + } + return arg1.token.compareTo(arg0.token) < 0; + } + + public TokenIntPair[] getArray() { + TokenIntPair[] topN = new TokenIntPair[size()]; + //now we reverse the queue + TokenIntPair term = pop(); + int i = topN.length-1; + while (term != null && i > -1) { + topN[i--] = term; + term = pop(); + } + return topN; + } +} \ No newline at end of file