/*
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements.  See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License.  You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
package org.apache.tika.eval.io;

import java.io.BufferedReader;
import java.io.IOException;
import java.io.InputStream;
import java.io.InputStreamReader;
import java.io.Reader;
import java.nio.charset.StandardCharsets;
import java.nio.file.Files;
import java.nio.file.Path;
import java.util.ArrayList;
import java.util.List;
import java.util.regex.Matcher;
import java.util.regex.Pattern;
import java.util.zip.ZipInputStream;

import org.apache.commons.compress.compressors.bzip2.BZip2CompressorInputStream;
import org.apache.commons.compress.compressors.gzip.GzipCompressorInputStream;
import org.apache.tika.config.TikaConfig;
import org.apache.tika.exception.TikaException;
import org.apache.tika.io.IOUtils;
import org.apache.tika.metadata.Metadata;
import org.apache.tika.metadata.serialization.JsonMetadataList;
import org.apache.tika.mime.MediaType;
import org.apache.tika.parser.RecursiveParserWrapper;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

/**
 * Loads a tika-eval extract file (.json or .txt, optionally compressed with
 * bz2, gz/gzip or zip) into a list of {@link Metadata} objects.
 */
public class ExtractReader {

    /** How to alter the metadata list after loading a json extract. */
    public enum ALTER_METADATA_LIST {
        AS_IS,                          //leave the metadata list as is
        FIRST_ONLY,                     //take only the metadata list for the "container" document
        CONCATENATE_CONTENT_INTO_FIRST  //concatenate all of the content into the first
    }

    private static final Logger LOGGER = LoggerFactory.getLogger(ExtractReader.class);

    //compiled once (was compiled per parseSuffixes call):
    //group(1)=original file name, group(2)=txt|json, group(3)=optional compression suffix
    private static final Pattern SUFFIX_PATTERN =
            Pattern.compile("^(.*?)\\.(json|txt)(?:\\.(bz2|gz(?:ip)?|zip))?$");

    TikaConfig tikaConfig = TikaConfig.getDefaultConfig();

    /**
     * Loads the extract stored at {@code thisFile}.
     *
     * @param thisFile extract file; must end in .txt or .json, optionally
     *                 followed by .bz2/.gz/.gzip/.zip
     * @param alterExtractList how to post-process the metadata list of a json extract
     * @return the metadata list, or {@code null} if the file is missing, has an
     *         unrecognized suffix, or could not be parsed
     */
    public List<Metadata> loadExtract(Path thisFile, ALTER_METADATA_LIST alterExtractList) {
        if (thisFile == null || !Files.isRegularFile(thisFile)) {
            return null;
        }
        FileSuffixes fileSuffixes = parseSuffixes(thisFile.getFileName().toString());
        if (fileSuffixes.txtOrJson == null) {
            LOGGER.warn("file must end with .txt or .json: " + thisFile.getFileName().toString());
            return null;
        }

        List<Metadata> metadataList = null;
        InputStream is = null;
        Reader reader = null;
        try {
            is = Files.newInputStream(thisFile);
            if (fileSuffixes.compression != null) {
                if (fileSuffixes.compression.equals("bz2")) {
                    is = new BZip2CompressorInputStream(is);
                } else if (fileSuffixes.compression.equals("gz")
                        || fileSuffixes.compression.equals("gzip")) {
                    //the suffix pattern allows both "gz" and "gzip";
                    //only "gz" was handled before, so ".gzip" fell through to the warn branch
                    is = new GzipCompressorInputStream(is);
                } else if (fileSuffixes.compression.equals("zip")) {
                    //position at the first entry of the zip archive;
                    //ZCompressorInputStream (unix .Z) was the wrong codec for .zip
                    ZipInputStream zis = new ZipInputStream(is);
                    zis.getNextEntry();
                    is = zis;
                } else {
                    LOGGER.warn("Can't yet process compression of type: " + fileSuffixes.compression);
                }
            }
            reader = new BufferedReader(new InputStreamReader(is, StandardCharsets.UTF_8));

            if (fileSuffixes.txtOrJson.equals("json")) {
                metadataList = JsonMetadataList.fromJson(reader);
                //guard against a null list from fromJson before dereferencing
                if (metadataList != null && metadataList.size() > 1) {
                    if (alterExtractList == ALTER_METADATA_LIST.FIRST_ONLY) {
                        metadataList.subList(1, metadataList.size()).clear();
                    } else if (alterExtractList == ALTER_METADATA_LIST.CONCATENATE_CONTENT_INTO_FIRST) {
                        //was mistakenly referenced as AS_IS.CONCATENATE_CONTENT_INTO_FIRST
                        concatenateContentIntoFirst(metadataList);
                    }
                }
            } else {
                metadataList = generateListFromTextFile(reader, fileSuffixes);
            }
        } catch (IOException | TikaException e) {
            LOGGER.warn("couldn't open:" + thisFile.toAbsolutePath(), e);
        } finally {
            IOUtils.closeQuietly(reader);
            IOUtils.closeQuietly(is);
        }
        return metadataList;
    }

    //concatenates the content of every metadata object (container included)
    //into the first metadata object and drops the rest
    private static void concatenateContentIntoFirst(List<Metadata> metadataList) {
        StringBuilder sb = new StringBuilder();
        for (Metadata m : metadataList) {
            String c = m.get(RecursiveParserWrapper.TIKA_CONTENT);
            if (c != null) {
                sb.append(c);
                sb.append(" ");
            }
        }
        metadataList.get(0).set(RecursiveParserWrapper.TIKA_CONTENT, sb.toString());
        metadataList.subList(1, metadataList.size()).clear();
    }

    //wraps the full text of a .txt extract in a single Metadata object,
    //using the original file name's suffix to guess the mime type
    private List<Metadata> generateListFromTextFile(Reader reader,
                                                    FileSuffixes fileSuffixes) throws IOException {
        List<Metadata> metadataList = new ArrayList<>();
        String content = IOUtils.toString(reader);
        Metadata m = new Metadata();
        m.set(RecursiveParserWrapper.TIKA_CONTENT, content);
        //Let's hope the file name has a suffix that can
        //be used to determine the mime.  Could be wrong or missing,
        //but better than nothing.
        m.set(Metadata.RESOURCE_NAME_KEY, fileSuffixes.originalFileName);

        MediaType mimeType = tikaConfig.getMimeRepository().detect(null, m);
        if (mimeType != null) {
            m.set(Metadata.CONTENT_TYPE, mimeType.toString());
        }
        metadataList.add(m);
        return metadataList;
    }

    /**
     * Splits a file name into the original name, the txt/json suffix and an
     * optional compression suffix.  Fields are left {@code null} if the name
     * does not match the expected pattern.
     */
    protected static FileSuffixes parseSuffixes(String fName) {
        FileSuffixes fileSuffixes = new FileSuffixes();
        if (fName == null) {
            return fileSuffixes;
        }
        Matcher m = SUFFIX_PATTERN.matcher(fName);
        if (m.find()) {
            fileSuffixes.originalFileName = m.group(1);
            fileSuffixes.txtOrJson = m.group(2);
            fileSuffixes.compression = m.group(3);
        }
        return fileSuffixes;
    }

    private static class FileSuffixes {
        String compression;       //bz2|gz|gzip|zip or null
        String txtOrJson;         //"txt" or "json" or null
        String originalFileName;  //file name with the extract suffixes stripped
    }
}
/*
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements.  See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License.  You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

package org.apache.tika.eval.io;


import java.io.IOException;
import java.util.Map;

import org.apache.tika.eval.db.Cols;
import org.apache.tika.eval.db.TableInfo;

/**
 * Writer that records tika-eval rows into database tables.
 */
public interface IDBWriter {
    /**
     * Writes one row of column-name/value pairs to the given table.
     *
     * @param table target table
     * @param data column -&gt; string value for this row
     * @throws IOException if the write fails
     */
    public void writeRow(TableInfo table, Map<Cols, String> data) throws IOException;

    /**
     * Closes this writer and releases any underlying resources.
     *
     * @throws IOException if the close fails
     */
    public void close() throws IOException;

    /**
     * Returns an integer id for the given mime string
     * (presumably a key into a mime lookup table -- confirm against implementations).
     */
    public int getMimeId(String mimeString);
}
/*
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements.  See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License.  You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
package org.apache.tika.eval.io;

import java.io.IOException;
import java.sql.SQLException;

import org.apache.log4j.Level;

/**
 * Callback invoked by {@link XMLLogReader} for each &lt;message&gt; element
 * found in a log4j xml log file.
 */
public interface XMLLogMsgHandler {
    /**
     * Handles one logged message.
     *
     * @param level log level of the enclosing &lt;event&gt; element
     * @param xml text content of the &lt;message&gt; element
     * @throws IOException if the message cannot be processed
     * @throws SQLException if a database write triggered by the message fails
     */
    public void handleMsg(Level level, String xml) throws IOException, SQLException;
}
/*
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements.  See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License.  You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
package org.apache.tika.eval.io;

import javax.xml.stream.XMLInputFactory;
import javax.xml.stream.XMLStreamConstants;
import javax.xml.stream.XMLStreamException;
import javax.xml.stream.XMLStreamReader;
import java.io.ByteArrayInputStream;
import java.io.IOException;
import java.io.InputStream;
import java.sql.SQLException;

import org.apache.log4j.Level;
import org.apache.log4j.Logger;
import org.apache.tika.io.IOUtils;
import org.apache.tika.parser.ParseContext;

/**
 * Reads a log4j xml log file and hands each &lt;message&gt; element's text,
 * together with the level of its enclosing &lt;event&gt;, to an
 * {@link XMLLogMsgHandler}.  The raw log output is wrapped with an eventSet
 * header/footer so that it parses as a single well-formed xml document.
 */
public class XMLLogReader {

    private static final Logger LOGGER = Logger.getLogger(XMLLogReader.class);

    /**
     * Streams the log file and invokes {@code handler} once per message.
     *
     * @param xmlLogFileIs stream over the raw log4j xml output
     * @param handler callback for each message
     * @throws XMLStreamException if the wrapped log is not well-formed xml
     */
    public void read(InputStream xmlLogFileIs, XMLLogMsgHandler handler) throws XMLStreamException {
        InputStream is = new LogXMLWrappingInputStream(xmlLogFileIs);
        XMLInputFactory factory = new ParseContext().getXMLInputFactory();
        XMLStreamReader reader = factory.createXMLStreamReader(is);

        Level level = null;
        while (reader.hasNext()) {
            reader.next();
            switch (reader.getEventType()) {
                case XMLStreamConstants.START_ELEMENT:
                    if ("event".equals(reader.getLocalName())) {
                        level = Level.toLevel(reader.getAttributeValue("", "level"), Level.DEBUG);
                    } else if ("message".equals(reader.getLocalName())) {
                        //read the text exactly once: getElementText() consumes the
                        //element, so the old catch blocks, which called it a second
                        //time for the log message, would have thrown instead of logging
                        String msg = reader.getElementText();
                        try {
                            handler.handleMsg(level, msg);
                        } catch (IOException e) {
                            //keep the cause in the log instead of printStackTrace()
                            LOGGER.warn("Error parsing: " + msg, e);
                        } catch (SQLException e) {
                            LOGGER.warn("SQLException: " + e.getMessage(), e);
                        }
                    }
                    break;
                case XMLStreamConstants.END_ELEMENT:
                    if ("event".equals(reader.getLocalName())) {
                        //level only applies within its enclosing event
                        level = null;
                    }
                    break;
                default:
                    break;
            }
        }
    }

    /**
     * Wraps a logger's xml output into a single parseable xml document by
     * prepending an eventSet header and appending a footer.
     * (Adapted from log4j's chainsaw.)
     */
    private static class LogXMLWrappingInputStream extends InputStream {
        private static final String HEADER =
                "<?xml version=\"1.0\" encoding=\"UTF-8\" ?>"
                        + "<log4j:eventSet version=\"1.2\" "
                        + "xmlns:log4j=\"http://jakarta.apache.org/log4j/\">";
        private static final String FOOTER = "</log4j:eventSet>";

        private final InputStream[] streams;
        private int currentStreamIndex = 0;

        private LogXMLWrappingInputStream(InputStream xmlLogFileIs) {
            streams = new InputStream[]{
                    new ByteArrayInputStream(HEADER.getBytes(IOUtils.UTF_8)),
                    xmlLogFileIs,
                    new ByteArrayInputStream(FOOTER.getBytes(IOUtils.UTF_8))};
        }

        @Override
        public int read() throws IOException {
            int c = streams[currentStreamIndex].read();
            if (c < 0) {
                IOUtils.closeQuietly(streams[currentStreamIndex]);
                //advance to the next stream that still has bytes
                while (currentStreamIndex < streams.length - 1) {
                    currentStreamIndex++;
                    int next = streams[currentStreamIndex].read();
                    if (next < 0) {
                        IOUtils.closeQuietly(streams[currentStreamIndex]);
                    } else {
                        return next;
                    }
                }
                return -1;
            }
            return c;
        }
    }
}
/**
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements.  See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License.  You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
package org.apache.tika.eval.reports;

import java.io.IOException;
import java.io.OutputStream;
import java.nio.file.Files;
import java.nio.file.Path;
import java.sql.Connection;
import java.sql.ResultSet;
import java.sql.ResultSetMetaData;
import java.sql.SQLException;
import java.sql.Statement;
import java.sql.Types;
import java.util.HashMap;
import java.util.Map;

import org.apache.log4j.Logger;
import org.apache.poi.ss.usermodel.Cell;
import org.apache.poi.ss.usermodel.CellStyle;
import org.apache.poi.ss.usermodel.Row;
import org.apache.poi.ss.usermodel.VerticalAlignment;
import org.apache.poi.xssf.streaming.SXSSFSheet;
import org.apache.poi.xssf.streaming.SXSSFWorkbook;
import org.apache.poi.xssf.usermodel.XSSFWorkbook;

/**
 * This class represents a single report: a sql query whose result set is
 * written to one sheet of an xlsx file, optionally followed by a second
 * sheet containing the sql itself.
 */
public class Report {

    static final Logger logger = Logger.getLogger(Report.class);

    //string written in place of SQL NULL
    final String NULL_VALUE = "";//TODO: make this configurable!!!

    //column label -> formatter; columns without an entry fall back to type defaults
    Map<String, XSLXCellFormatter> cellFormatters = new HashMap<>();
    private XLSXNumFormatter defaultDoubleFormatter = new XLSXNumFormatter("0.000");
    private XLSXNumFormatter defaultIntegerFormatter = new XLSXNumFormatter("0");
    private CellStyle sqlCellStyle;

    String sql;
    String reportFilename;
    boolean includeSql = true;

    String reportName;

    /**
     * Runs this report's sql against {@code c} and writes the xlsx file
     * under {@code reportsRoot}.
     */
    public void writeReport(Connection c, Path reportsRoot) throws SQLException, IOException {
        logger.info("Writing report: " + reportName + " to " + reportFilename);
        dumpXLSX(c, reportsRoot);
    }

    private void dumpXLSX(Connection c, Path reportsRoot) throws IOException, SQLException {
        Path out = reportsRoot.resolve(reportFilename);
        Files.createDirectories(out.getParent());

        SXSSFWorkbook wb = new SXSSFWorkbook(new XSSFWorkbook(), 100, true, true);
        wb.setCompressTempFiles(true);
        defaultIntegerFormatter.reset(wb.getXSSFWorkbook());
        defaultDoubleFormatter.reset(wb.getXSSFWorkbook());
        sqlCellStyle = wb.createCellStyle();
        sqlCellStyle.setVerticalAlignment(VerticalAlignment.TOP);
        sqlCellStyle.setWrapText(true);

        //statement was never closed before
        try (Statement st = c.createStatement()) {
            dumpReportToWorkbook(st, wb);
        } finally {
            try (OutputStream os = Files.newOutputStream(out)) {
                wb.write(os);
            } finally {
                //deletes the temp files backing the streaming workbook
                wb.dispose();
            }
        }
    }

    private void dumpReportToWorkbook(Statement st, SXSSFWorkbook wb) throws IOException, SQLException {
        SXSSFSheet sheet = wb.createSheet("tika-eval Report");
        sheet.trackColumnForAutoSizing(0);

        //result set was never closed before
        try (ResultSet rs = st.executeQuery(sql)) {
            int rowCount = 0;
            ResultSetMetaData meta = rs.getMetaData();

            //header row of column labels
            Row xssfRow = sheet.createRow(rowCount++);
            for (int i = 1; i <= meta.getColumnCount(); i++) {
                xssfRow.createCell(i - 1).setCellValue(meta.getColumnLabel(i));
            }

            while (rs.next()) {
                xssfRow = sheet.createRow(rowCount++);
                for (int i = 1; i <= meta.getColumnCount(); i++) {
                    Cell cell = xssfRow.createCell(i - 1);
                    XSLXCellFormatter formatter = cellFormatters.get(meta.getColumnLabel(i));
                    if (formatter == null) {
                        formatter = getDefaultFormatter(meta.getColumnType(i));
                    }
                    if (formatter != null) {
                        formatter.applyStyleAndValue(i, rs, cell);
                    } else {
                        writeCell(meta, i, rs, cell);
                    }
                }
            }
        }
        sheet.autoSizeColumn(0);

        if (!includeSql) {
            return;
        }

        //second sheet containing the raw sql for this report
        SXSSFSheet sqlSheet = wb.createSheet("tika-eval SQL");
        sqlSheet.setColumnWidth(0, 100 * 250);
        Row sqlRow = sqlSheet.createRow(0);
        short height = 5000;
        sqlRow.setHeight(height);
        Cell cell = sqlRow.createCell(0);
        cell.setCellStyle(sqlCellStyle);

        cell.setCellValue(sql.trim());//.replaceAll("[\r\n]+", "\r\n"));
    }

    //maps a java.sql.Types constant to the default numeric formatter, or null
    //if the column should be written via writeCell
    private XSLXCellFormatter getDefaultFormatter(int columnType) {
        switch (columnType) {
            case Types.INTEGER:
                return defaultIntegerFormatter;
            case Types.DOUBLE:
            case Types.FLOAT:
            case Types.DECIMAL:
                return defaultDoubleFormatter;
            default:
                return null;
        }
    }

    //writes one unformatted cell; SQL NULL becomes NULL_VALUE
    private void writeCell(ResultSetMetaData meta, int colIndex, ResultSet rs,
                           Cell cell) throws SQLException {

        switch (meta.getColumnType(colIndex)) {
            //fall through on numerics
            case Types.BIGINT:
            case Types.SMALLINT:
            case Types.INTEGER:
            case Types.DOUBLE:
            case Types.FLOAT:
            case Types.DECIMAL:
            case Types.NUMERIC:
                double dbl = rs.getDouble(colIndex);
                if (rs.wasNull()) {
                    cell.setCellValue(NULL_VALUE);
                } else {
                    cell.setCellValue(dbl);
                }
                break;
            //fall through strings
            case Types.CHAR:
            case Types.VARCHAR:
            case Types.LONGNVARCHAR:
                String val = rs.getString(colIndex);
                if (rs.wasNull()) {
                    cell.setCellValue(NULL_VALUE);
                } else {
                    cell.setCellValue(val);
                }
                break;
            default:
                //must read the column BEFORE calling wasNull(); the old code
                //called wasNull() first, which reported the previous column's
                //nullness (JDBC: wasNull applies to the last column read)
                String unknown = rs.getString(colIndex);
                if (rs.wasNull()) {
                    cell.setCellValue(NULL_VALUE);
                } else {
                    cell.setCellValue(unknown);
                }
                logger.warn("Couldn't find type for: " + meta.getColumnType(colIndex) +
                        ". Defaulting to String");
        }
    }

}
/*
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements.  See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License.  You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
package org.apache.tika.eval.reports;


import javax.xml.parsers.DocumentBuilder;
import java.io.IOException;
import java.io.InputStream;
import java.nio.file.Files;
import java.nio.file.Path;
import java.nio.file.Paths;
import java.nio.file.StandardCopyOption;
import java.sql.Connection;
import java.sql.DatabaseMetaData;
import java.sql.ResultSet;
import java.sql.SQLException;
import java.sql.Statement;
import java.util.ArrayList;
import java.util.HashMap;
import java.util.List;
import java.util.Map;

import org.apache.commons.cli.CommandLine;
import org.apache.commons.cli.DefaultParser;
import org.apache.commons.cli.HelpFormatter;
import org.apache.commons.cli.Option;
import org.apache.commons.cli.Options;
import org.apache.commons.cli.ParseException;
import org.apache.poi.common.usermodel.Hyperlink;
import org.apache.tika.eval.ExtractComparer;
import org.apache.tika.eval.ExtractProfiler;
import org.apache.tika.eval.db.DBUtil;
import org.apache.tika.eval.db.H2Util;
import org.apache.tika.parser.ParseContext;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import org.w3c.dom.Document;
import org.w3c.dom.NamedNodeMap;
import org.w3c.dom.Node;
import org.w3c.dom.NodeList;

/**
 * Command-line tool that runs the sql reports defined in an xml config file
 * against a tika-eval database and writes each one to an xlsx file.
 */
public class ResultsReporter {

    protected static final Logger LOGGER = LoggerFactory.getLogger(ResultsReporter.class);

    private static final Options OPTIONS;

    static {
        OPTIONS = new Options();

        Option db = new Option("db", "database");
        db.setRequired(true);
        db.setArgs(1);

        OPTIONS.addOption(db)
                .addOption("rd", "reportsDir", true, "directory for the reports. " +
                        "If not specified, will write to 'reports'" +
                        "BEWARE: Will overwrite existing reports without warning!"
                )
                .addOption("rf", "reportsFile", true, "xml specifying sql to call for the reports." +
                        "If not specified, will use default reports in resources/tika-eval-*-config.xml");

    }

    /** Prints command-line usage. */
    public static void USAGE() {
        HelpFormatter helpFormatter = new HelpFormatter();
        helpFormatter.printHelp(
                80,
                "java -jar tika-eval-x.y.jar Report -db mydb [-rd myreports] [-rf myreports.xml]",
                "Tool: Report",
                ResultsReporter.OPTIONS,
                "Note: for h2 db, do not include the .mv.db at the end of the db name.");

    }

    //sql statements run before/after the reports, and the reports themselves,
    //in document order
    List<String> before = new ArrayList<>();
    List<String> after = new ArrayList<>();
    List<Report> reports = new ArrayList<>();

    private void addBefore(String b) {
        before.add(b);
    }

    private void addAfter(String a) {
        after.add(a);
    }

    private void addReport(Report r) {
        reports.add(r);
    }

    /**
     * Parses a reports xml config file into a ResultsReporter.
     *
     * @param p path to the xml config
     * @throws IllegalArgumentException if the config is structurally invalid
     */
    public static ResultsReporter build(Path p) throws Exception {

        ResultsReporter r = new ResultsReporter();

        DocumentBuilder docBuilder = new ParseContext().getDocumentBuilder();
        Document doc;
        try (InputStream is = Files.newInputStream(p)) {
            doc = docBuilder.parse(is);
        }
        Node docElement = doc.getDocumentElement();
        //was a bare assert, which is a no-op unless the JVM runs with -ea
        if (!"reports".equals(docElement.getNodeName())) {
            throw new IllegalArgumentException(
                    "root element must be <reports>, found: " + docElement.getNodeName());
        }
        NodeList children = docElement.getChildNodes();
        for (int i = 0; i < children.getLength(); i++) {
            Node n = children.item(i);
            if ("before".equals(n.getNodeName())) {
                for (String before : getSql(n)) {
                    r.addBefore(before);
                }
            } else if ("after".equals(n.getNodeName())) {
                for (String after : getSql(n)) {
                    r.addAfter(after);
                }
            } else if ("report".equals(n.getNodeName())) {
                r.addReport(buildReport(n));
            }
        }

        return r;
    }

    //builds one Report from a <report> element
    private static Report buildReport(Node n) {
        NodeList children = n.getChildNodes();
        Report r = new Report();
        NamedNodeMap attrs = n.getAttributes();

        //optional attribute; defaults to true when absent (was an unguarded NPE)
        Node includeSqlNode = attrs.getNamedItem("includeSql");
        if (includeSqlNode != null) {
            r.includeSql = Boolean.parseBoolean(includeSqlNode.getNodeValue());
        }
        r.reportFilename = attrs.getNamedItem("reportFilename").getNodeValue();
        r.reportName = attrs.getNamedItem("reportName").getNodeValue();

        for (int i = 0; i < children.getLength(); i++) {
            Node child = children.item(i);
            if (child.getNodeType() != Node.ELEMENT_NODE) { //was magic number 1
                continue;
            }
            if ("sql".equals(child.getNodeName())) {
                if (r.sql != null) {
                    throw new IllegalArgumentException("Can only have one sql statement per report");
                }
                r.sql = child.getTextContent();
            } else if ("colformats".equals(child.getNodeName())) {
                r.cellFormatters = getCellFormatters(child);
            } else {
                throw new IllegalArgumentException("Not expecting to see:" + child.getNodeName());
            }
        }
        return r;
    }

    //builds the column-name -> formatter map from a <colformats> element
    private static Map<String, XSLXCellFormatter> getCellFormatters(Node n) {
        NodeList children = n.getChildNodes();
        Map<String, XSLXCellFormatter> ret = new HashMap<>();
        for (int i = 0; i < children.getLength(); i++) {
            Node child = children.item(i);
            if (child.getNodeType() != Node.ELEMENT_NODE) {
                continue;
            }
            NamedNodeMap attrs = child.getAttributes();
            String columnName = attrs.getNamedItem("name").getNodeValue();
            //was a bare assert; fail loudly on a duplicate column format
            if (ret.containsKey(columnName)) {
                throw new IllegalArgumentException("duplicate column format for: " + columnName);
            }
            String type = attrs.getNamedItem("type").getNodeValue();
            if ("numberFormatter".equals(type)) {
                String format = attrs.getNamedItem("format").getNodeValue();
                ret.put(columnName, new XLSXNumFormatter(format));
            } else if ("urlLink".equals(type)) {
                ret.put(columnName, new XLSXHREFFormatter(getBase(attrs), Hyperlink.LINK_URL));
            } else if ("fileLink".equals(type)) {
                ret.put(columnName, new XLSXHREFFormatter(getBase(attrs), Hyperlink.LINK_FILE));
            }
        }
        return ret;
    }

    //returns the optional "base" attribute, or "" if absent
    private static String getBase(NamedNodeMap attrs) {
        Node baseNode = attrs.getNamedItem("base");
        return (baseNode == null) ? "" : baseNode.getNodeValue();
    }

    //collects the text content of each child element of n
    private static List<String> getSql(Node n) {
        List<String> ret = new ArrayList<>();

        NodeList children = n.getChildNodes();

        for (int i = 0; i < children.getLength(); i++) {
            Node child = children.item(i);
            if (child.getNodeType() != Node.ELEMENT_NODE) {
                continue;
            }
            ret.add(child.getTextContent());
        }
        return ret;
    }

    public static void main(String[] args) throws Exception {

        DefaultParser defaultCLIParser = new DefaultParser();
        CommandLine commandLine = null;
        try {
            commandLine = defaultCLIParser.parse(OPTIONS, args);
        } catch (ParseException e) {
            System.out.println(e.getMessage());
            USAGE();
            return;
        }
        Path db = Paths.get(commandLine.getOptionValue("db"));
        DBUtil dbUtil = new H2Util(db);

        try (Connection c = dbUtil.getConnection(true)) {
            Path tmpReportsFile = null;
            try {
                ResultsReporter resultsReporter = null;
                String reportsFile = commandLine.getOptionValue("rf");
                if (reportsFile == null) {
                    //no config given: pick the bundled config matching this db's schema
                    tmpReportsFile = getDefaultReportsConfig(c);
                    resultsReporter = ResultsReporter.build(tmpReportsFile);
                } else {
                    resultsReporter = ResultsReporter.build(Paths.get(reportsFile));
                }

                Path reportsRootDirectory = Paths.get(commandLine.getOptionValue("rd", "reports"));
                if (Files.isDirectory(reportsRootDirectory)) {
                    LOGGER.warn("'Reports' directory exists.  Will overwrite existing reports.");
                }

                resultsReporter.execute(c, reportsRootDirectory);
            } finally {
                if (tmpReportsFile != null) {
                    Files.deleteIfExists(tmpReportsFile);
                }
            }
        }
    }

    //copies the bundled reports config that matches this db's schema
    //(comparison vs. profile) to a temp file and returns its path
    private static Path getDefaultReportsConfig(Connection c) throws IOException, SQLException {
        DatabaseMetaData md = c.getMetaData();
        String internalPath = null;
        try (ResultSet rs = md.getTables(null, null, "%", null)) {
            while (rs.next()) {
                String tName = rs.getString(3);
                if (ExtractComparer.CONTENTS_TABLE_B.getName().equalsIgnoreCase(tName)) {
                    internalPath = "/comparison-reports.xml";
                    break;
                } else if (ExtractProfiler.PROFILE_TABLE.getName().equalsIgnoreCase(tName)) {
                    internalPath = "/profile-reports.xml";
                    break;
                }
            }
        }

        if (internalPath == null) {
            throw new RuntimeException("Couldn't determine if this database was a 'profiler' or 'comparison' db");
        }
        Path tmp = Files.createTempFile("tmp-tika-reports", ".xml");
        //resource stream was never closed before
        try (InputStream is = ResultsReporter.class.getResourceAsStream(internalPath)) {
            Files.copy(is, tmp, StandardCopyOption.REPLACE_EXISTING);
        }
        return tmp;
    }

    /**
     * Runs the before-statements, each report, then the after-statements.
     */
    public void execute(Connection c, Path reportsDirectory) throws IOException, SQLException {
        //statement was never closed before
        try (Statement st = c.createStatement()) {
            for (String sql : before) {
                st.execute(sql);
            }
            for (Report r : reports) {
                r.writeReport(c, reportsDirectory);
            }
            for (String sql : after) {
                st.execute(sql);
            }
        }
    }
}
/*
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements.  See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License.  You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
package org.apache.tika.eval.reports;

import java.nio.file.Paths;
import java.sql.ResultSet;
import java.sql.SQLException;

import org.apache.poi.ss.usermodel.Cell;
import org.apache.poi.ss.usermodel.CellStyle;
import org.apache.poi.ss.usermodel.Font;
import org.apache.poi.ss.usermodel.Hyperlink;
import org.apache.poi.ss.usermodel.IndexedColors;
import org.apache.poi.xssf.usermodel.XSSFWorkbook;

/**
 * Cell formatter that renders a string column as a clickable hyperlink
 * (url or file), showing only the path's file name as the cell text.
 * Stops adding links once the per-workbook limit is reached.
 */
public class XLSXHREFFormatter implements XSLXCellFormatter {
    //xlsx files can only have this many hyperlinks
    //if they have more Excel can't read the file
    private static final int MAX_HYPERLINKS = 65000;


    private final String urlBase;
    private final int linkType;
    private XSSFWorkbook workbook;
    private CellStyle style;
    private int links = 0;

    public XLSXHREFFormatter(String urlBase,
                             int linkType) {
        this.urlBase = urlBase;
        this.linkType = linkType;
    }

    @Override
    public void reset(XSSFWorkbook workbook) {
        this.workbook = workbook;
        //blue, underlined text to look like a conventional hyperlink
        Font linkFont = workbook.createFont();
        linkFont.setUnderline(Font.U_SINGLE);
        linkFont.setColor(IndexedColors.BLUE.getIndex());
        style = workbook.createCellStyle();
        style.setFont(linkFont);
        links = 0;
    }

    @Override
    public void applyStyleAndValue(int dbColNum, ResultSet resultSet, Cell cell) throws SQLException {
        if (links >= MAX_HYPERLINKS) {
            //silently stop adding hyperlinks
            return;
        }
        String relativePath = resultSet.getString(dbColNum);
        Hyperlink hyperlink = workbook.getCreationHelper().createHyperlink(linkType);
        hyperlink.setAddress(urlBase + relativePath);
        cell.setHyperlink(hyperlink);
        cell.setCellStyle(style);
        cell.setCellValue(Paths.get(relativePath).getFileName().toString());
        links++;
    }
}
See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.tika.eval.reports; + +import java.sql.ResultSet; +import java.sql.SQLException; + +import org.apache.poi.ss.usermodel.Cell; +import org.apache.poi.ss.usermodel.CellStyle; +import org.apache.poi.xssf.usermodel.XSSFWorkbook; + +class XLSXNumFormatter implements XSLXCellFormatter { + + private final String formatString; + private CellStyle style; + + XLSXNumFormatter(String formatString) { + this.formatString = formatString; + } + + + @Override + public void reset(XSSFWorkbook workbook) { + style = workbook.createCellStyle(); + style.setDataFormat(workbook.getCreationHelper() + .createDataFormat().getFormat(formatString)); + } + + @Override + public void applyStyleAndValue(int dbColNum, ResultSet resultSet, Cell cell) throws SQLException { + double d = resultSet.getDouble(dbColNum); + if (resultSet.wasNull()) { + + } else { + cell.setCellStyle(style); + } + cell.setCellValue(resultSet.getDouble(dbColNum)); + } +} http://git-wip-us.apache.org/repos/asf/tika/blob/aa7a0c35/tika-eval/src/main/java/org/apache/tika/eval/reports/XSLXCellFormatter.java ---------------------------------------------------------------------- diff --git a/tika-eval/src/main/java/org/apache/tika/eval/reports/XSLXCellFormatter.java 
b/tika-eval/src/main/java/org/apache/tika/eval/reports/XSLXCellFormatter.java new file mode 100644 index 0000000..eb0e024 --- /dev/null +++ b/tika-eval/src/main/java/org/apache/tika/eval/reports/XSLXCellFormatter.java @@ -0,0 +1,30 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ +package org.apache.tika.eval.reports; + + +import java.sql.ResultSet; +import java.sql.SQLException; + +import org.apache.poi.ss.usermodel.Cell; +import org.apache.poi.xssf.usermodel.XSSFWorkbook; + +interface XSLXCellFormatter { + + public void reset(XSSFWorkbook workbook); + public void applyStyleAndValue(int dbColNum, ResultSet resultSet, Cell cell) throws SQLException; +} \ No newline at end of file http://git-wip-us.apache.org/repos/asf/tika/blob/aa7a0c35/tika-eval/src/main/java/org/apache/tika/eval/tokens/AlphaIdeographFilterFactory.java ---------------------------------------------------------------------- diff --git a/tika-eval/src/main/java/org/apache/tika/eval/tokens/AlphaIdeographFilterFactory.java b/tika-eval/src/main/java/org/apache/tika/eval/tokens/AlphaIdeographFilterFactory.java new file mode 100644 index 0000000..fb72e84 --- /dev/null +++ b/tika-eval/src/main/java/org/apache/tika/eval/tokens/AlphaIdeographFilterFactory.java @@ -0,0 +1,74 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ +package org.apache.tika.eval.tokens; + +import java.io.IOException; +import java.util.Map; + +import org.apache.lucene.analysis.FilteringTokenFilter; +import org.apache.lucene.analysis.TokenStream; +import org.apache.lucene.analysis.tokenattributes.CharTermAttribute; +import org.apache.lucene.analysis.util.TokenFilterFactory; + +/** + * Factory for filter that only allows tokens with characters that "isAlphabetic" or "isIdeographic" through. + */ +public class AlphaIdeographFilterFactory extends TokenFilterFactory { + + + + public AlphaIdeographFilterFactory(Map<String, String> args) { + super(args); + } + + @Override + public TokenStream create(TokenStream tokenStream) { + return new AlphaFilter(tokenStream); + } + + /** + * Remove tokens tokens that do not contain an " + */ + private class AlphaFilter extends FilteringTokenFilter { + + private final CharTermAttribute termAtt = addAttribute(CharTermAttribute.class); + + public AlphaFilter(TokenStream in) { + super(in); + } + + @Override + protected boolean accept() throws IOException { + char[] buff = termAtt.buffer(); + for (int i = 0; i < termAtt.length(); i++) { + int cp = buff[i]; + if (Character.isHighSurrogate(buff[i])) { + if (i < termAtt.length()-1) { + cp = Character.toCodePoint(buff[i], buff[i + 1]); + i++; + } + } + + if (Character.isAlphabetic(cp) || + Character.isIdeographic(cp)) { + return true; + } + } + return false; + } + } +} http://git-wip-us.apache.org/repos/asf/tika/blob/aa7a0c35/tika-eval/src/main/java/org/apache/tika/eval/tokens/AnalyzerDeserializer.java ---------------------------------------------------------------------- diff --git a/tika-eval/src/main/java/org/apache/tika/eval/tokens/AnalyzerDeserializer.java b/tika-eval/src/main/java/org/apache/tika/eval/tokens/AnalyzerDeserializer.java new file mode 100644 index 0000000..cfc2d4f --- /dev/null +++ b/tika-eval/src/main/java/org/apache/tika/eval/tokens/AnalyzerDeserializer.java @@ -0,0 +1,345 @@ +/* + * Licensed to the Apache 
Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.apache.tika.eval.tokens; + + +import java.io.IOException; +import java.io.Reader; +import java.lang.reflect.Type; +import java.util.Collections; +import java.util.HashMap; +import java.util.LinkedList; +import java.util.List; +import java.util.Map; + +import com.google.gson.JsonArray; +import com.google.gson.JsonDeserializationContext; +import com.google.gson.JsonDeserializer; +import com.google.gson.JsonElement; +import com.google.gson.JsonObject; +import com.google.gson.JsonParseException; +import org.apache.lucene.analysis.Analyzer; +import org.apache.lucene.analysis.TokenStream; +import org.apache.lucene.analysis.Tokenizer; +import org.apache.lucene.analysis.util.CharFilterFactory; +import org.apache.lucene.analysis.util.ClasspathResourceLoader; +import org.apache.lucene.analysis.util.ResourceLoaderAware; +import org.apache.lucene.analysis.util.TokenFilterFactory; +import org.apache.lucene.analysis.util.TokenizerFactory; + +class AnalyzerDeserializer implements JsonDeserializer<Map<String, Analyzer>> { + + + private static String ANALYZERS = "analyzers"; + private static String CHAR_FILTERS = "charfilters"; + private static String TOKEN_FILTERS = "tokenfilters"; + private static 
String TOKENIZER = "tokenizer"; + private static String FACTORY = "factory"; + private static String PARAMS = "params"; + + @Override + public Map<String, Analyzer> deserialize(JsonElement element, Type type, + JsonDeserializationContext jsonDeserializationContext) throws JsonParseException { + if (! element.isJsonObject()) { + throw new IllegalArgumentException("Expecting top level 'analyzers:{}'"); + } + + JsonElement root = element.getAsJsonObject().get(ANALYZERS); + if (root == null) { + throw new IllegalArgumentException("Expecting top level 'analyzers:{}"); + } + try { + return buildAnalyzers(root); + } catch (IOException e) { + throw new RuntimeException(e); + } + + } + + public static Map<String, Analyzer> buildAnalyzers(JsonElement value) throws IOException { + if (! value.isJsonObject()) { + throw new IllegalArgumentException("Expecting map with analyzer names/analyzer definitions"); + } + Map<String, Analyzer> analyzers = new HashMap<>(); + JsonObject root = (JsonObject)value; + for (Map.Entry<String, JsonElement> e : root.entrySet()) { + String analyzerName = e.getKey(); + Analyzer analyzer = buildAnalyzer(analyzerName, e.getValue()); + analyzers.put(analyzerName, analyzer); + } + return analyzers; + } + + public static Analyzer buildAnalyzer(String analyzerName, JsonElement value) throws IOException { + if (! 
value.isJsonObject()) { + throw new IllegalArgumentException("Expecting map of charfilter, tokenizer, tokenfilters"); + } + JsonObject aRoot = (JsonObject)value; + CharFilterFactory[] charFilters = new CharFilterFactory[0]; + TokenizerFactory tokenizerFactory = null; + TokenFilterFactory[] tokenFilterFactories = new TokenFilterFactory[0]; + for ( Map.Entry<String, JsonElement> e : aRoot.entrySet()) { + String k = e.getKey(); + if (k.equals(CHAR_FILTERS)) { + charFilters = buildCharFilters(e.getValue(), analyzerName); + } else if (k.equals(TOKEN_FILTERS)) { + tokenFilterFactories = buildTokenFilterFactories(e.getValue(), analyzerName); + } else if (k.equals(TOKENIZER)) { + tokenizerFactory = buildTokenizerFactory(e.getValue(), analyzerName); + } else { + throw new IllegalArgumentException("Should have one of three values here:"+ + CHAR_FILTERS + ", "+ + TOKENIZER+", "+ + TOKEN_FILTERS + + ". I don't recognize: "+k); + } + } + if (tokenizerFactory == null) { + throw new IllegalArgumentException("Must specify at least a tokenizer factory for an analyzer!"); + } + return new MyTokenizerChain(charFilters, tokenizerFactory, tokenFilterFactories); + } + + private static TokenizerFactory buildTokenizerFactory(JsonElement map, String analyzerName) throws IOException { + if (!(map instanceof JsonObject)) { + throw new IllegalArgumentException("Expecting a map with \"factory\" string and " + + "\"params\" map in tokenizer factory;"+ + " not: "+map.toString() + " in "+analyzerName); + } + JsonElement factoryEl = ((JsonObject)map).get(FACTORY); + if (factoryEl == null || ! factoryEl.isJsonPrimitive()) { + throw new IllegalArgumentException("Expecting value for factory in char filter factory builder in:"+ + analyzerName); + } + String factoryName = factoryEl.getAsString(); + factoryName = factoryName.startsWith("oala.") ? 
+ factoryName.replaceFirst("oala.", "org.apache.lucene.analysis.") : factoryName; + + JsonElement paramsEl = ((JsonObject)map).get(PARAMS); + Map<String, String> params = mapify(paramsEl); + String spiName = ""; + for (String s : TokenizerFactory.availableTokenizers()) { + Class clazz = TokenizerFactory.lookupClass(s); + if (clazz.getName().equals(factoryName)) { + spiName = s; + break; + } + } + if (spiName.equals("")) { + throw new IllegalArgumentException("A SPI class of type org.apache.lucene.analysis.util.TokenizerFactory with name"+ + "'"+factoryName+"' does not exist."); + } + try { + TokenizerFactory tokenizerFactory = TokenizerFactory.forName(spiName, params); + if (tokenizerFactory instanceof ResourceLoaderAware) { + ((ResourceLoaderAware) tokenizerFactory).inform(new ClasspathResourceLoader(AnalyzerDeserializer.class)); + } + + return tokenizerFactory; + } catch (IllegalArgumentException e) { + throw new IllegalArgumentException("While working on "+analyzerName, e); + } + } + + private static CharFilterFactory[] buildCharFilters(JsonElement el, String analyzerName) throws IOException { + if (el == null || el.isJsonNull()) { + return null; + } + if (! el.isJsonArray()) { + throw new IllegalArgumentException("Expecting array for charfilters, but got:"+el.toString() + + " for "+analyzerName); + } + JsonArray jsonArray = (JsonArray)el; + List<CharFilterFactory> ret = new LinkedList<CharFilterFactory>(); + for (JsonElement filterMap : jsonArray) { + if (!(filterMap instanceof JsonObject)) { + throw new IllegalArgumentException("Expecting a map with \"factory\" string and \"params\" map in char filter factory;"+ + " not: "+filterMap.toString() + " in "+analyzerName); + } + JsonElement factoryEl = ((JsonObject)filterMap).get(FACTORY); + if (factoryEl == null || ! 
factoryEl.isJsonPrimitive()) { + throw new IllegalArgumentException( + "Expecting value for factory in char filter factory builder in:"+analyzerName); + } + String factoryName = factoryEl.getAsString(); + factoryName = factoryName.replaceAll("oala.", "org.apache.lucene.analysis."); + + JsonElement paramsEl = ((JsonObject)filterMap).get(PARAMS); + Map<String, String> params = mapify(paramsEl); + String spiName = ""; + for (String s : CharFilterFactory.availableCharFilters()) { + Class clazz = CharFilterFactory.lookupClass(s); + if (clazz.getName().equals(factoryName)) { + spiName = s; + break; + } + } + if (spiName.equals("")) { + throw new IllegalArgumentException("A SPI class of type org.apache.lucene.analysis.util.CharFilterFactory with name"+ + "'"+factoryName+"' does not exist."); + } + + try { + CharFilterFactory charFilterFactory = CharFilterFactory.forName(spiName, params); + if (charFilterFactory instanceof ResourceLoaderAware) { + ((ResourceLoaderAware) charFilterFactory).inform(new ClasspathResourceLoader(AnalyzerDeserializer.class)); + } + ret.add(charFilterFactory); + } catch (IllegalArgumentException e) { + throw new IllegalArgumentException("While trying to load "+ + analyzerName + ": "+ e.getMessage(), e); + } + } + if (ret.size() == 0) { + return new CharFilterFactory[0]; + } + return ret.toArray(new CharFilterFactory[ret.size()]); + } + + private static TokenFilterFactory[] buildTokenFilterFactories(JsonElement el, + String analyzerName) throws IOException { + if (el == null || el.isJsonNull()) { + return null; + } + if (! 
el.isJsonArray()) { + throw new IllegalArgumentException( + "Expecting array for tokenfilters, but got:"+el.toString() + " in "+analyzerName); + } + JsonArray jsonArray = (JsonArray)el; + List<TokenFilterFactory> ret = new LinkedList<>(); + for (JsonElement filterMap : jsonArray) { + if (!(filterMap instanceof JsonObject)) { + throw new IllegalArgumentException("Expecting a map with \"factory\" string and \"params\" map in token filter factory;"+ + " not: "+filterMap.toString() + " in "+ analyzerName); + } + JsonElement factoryEl = ((JsonObject)filterMap).get(FACTORY); + if (factoryEl == null || ! factoryEl.isJsonPrimitive()) { + throw new IllegalArgumentException("Expecting value for factory in token filter factory builder in "+analyzerName); + } + String factoryName = factoryEl.getAsString(); + factoryName = factoryName.startsWith("oala.") ? + factoryName.replaceFirst("oala.", "org.apache.lucene.analysis.") : + factoryName; + + JsonElement paramsEl = ((JsonObject)filterMap).get(PARAMS); + Map<String, String> params = mapify(paramsEl); + String spiName = ""; + for (String s : TokenFilterFactory.availableTokenFilters()) { + Class clazz = TokenFilterFactory.lookupClass(s); + if (clazz.getName().equals(factoryName)) { + spiName = s; + break; + } + } + if (spiName.equals("")) { + throw new IllegalArgumentException("A SPI class of type org.apache.lucene.analysis.util.TokenFilterFactory with name"+ + "'"+factoryName+"' does not exist."); + } + + try { + TokenFilterFactory tokenFilterFactory = TokenFilterFactory.forName(spiName, params); + if (tokenFilterFactory instanceof ResourceLoaderAware) { + ((ResourceLoaderAware) tokenFilterFactory).inform(new ClasspathResourceLoader(AnalyzerDeserializer.class)); + } + ret.add(tokenFilterFactory); + } catch (IllegalArgumentException e) { + throw new IllegalArgumentException("While loading "+analyzerName, e); + } + } + if (ret.size() == 0) { + return new TokenFilterFactory[0]; + } + return ret.toArray(new 
TokenFilterFactory[ret.size()]); + } + + private static Map<String, String> mapify(JsonElement paramsEl) { + if (paramsEl == null || paramsEl.isJsonNull()) { + return Collections.EMPTY_MAP; + } + if (! paramsEl.isJsonObject()) { + throw new IllegalArgumentException("Expecting map, not: "+paramsEl.toString()); + } + Map<String, String> params = new HashMap<>(); + for (Map.Entry<String,JsonElement> e : ((JsonObject)paramsEl).entrySet()) { + JsonElement value = e.getValue(); + if (! value.isJsonPrimitive()) { + throw new IllegalArgumentException("Expecting parameter to have primitive value: "+value.toString()); + } + String v = e.getValue().getAsString(); + params.put(e.getKey(), v); + } + return params; + } + + /** + * Plagiarized verbatim from Solr! + */ + private static class MyTokenizerChain extends Analyzer { + + final private CharFilterFactory[] charFilters; + final private TokenizerFactory tokenizer; + final private TokenFilterFactory[] filters; + + public MyTokenizerChain(TokenizerFactory tokenizer, TokenFilterFactory[] filters) { + this(null, tokenizer, filters); + } + + public MyTokenizerChain(CharFilterFactory[] charFilters, TokenizerFactory tokenizer, TokenFilterFactory[] filters) { + this.charFilters = charFilters; + this.tokenizer = tokenizer; + this.filters = filters; + } + + public CharFilterFactory[] getCharFilterFactories() { + return charFilters; + } + + public TokenizerFactory getTokenizerFactory() { + return tokenizer; + } + + public TokenFilterFactory[] getTokenFilterFactories() { + return filters; + } + + @Override + public Reader initReader(String fieldName, Reader reader) { + + if (charFilters != null && charFilters.length > 0) { + Reader cs = reader; + for (CharFilterFactory charFilter : charFilters) { + cs = charFilter.create(cs); + } + reader = cs; + } + + return reader; + } + + @Override + protected TokenStreamComponents createComponents(String fieldName) { + Tokenizer tk = tokenizer.create(); + TokenStream ts = tk; + for 
(TokenFilterFactory filter : filters) { + ts = filter.create(ts); + } + + return new TokenStreamComponents(tk, ts); + } + } + +} http://git-wip-us.apache.org/repos/asf/tika/blob/aa7a0c35/tika-eval/src/main/java/org/apache/tika/eval/tokens/AnalyzerManager.java ---------------------------------------------------------------------- diff --git a/tika-eval/src/main/java/org/apache/tika/eval/tokens/AnalyzerManager.java b/tika-eval/src/main/java/org/apache/tika/eval/tokens/AnalyzerManager.java new file mode 100644 index 0000000..db6ae26 --- /dev/null +++ b/tika-eval/src/main/java/org/apache/tika/eval/tokens/AnalyzerManager.java @@ -0,0 +1,95 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ +package org.apache.tika.eval.tokens; + +import java.io.IOException; +import java.io.InputStream; +import java.io.InputStreamReader; +import java.io.Reader; +import java.util.Map; + +import com.google.gson.Gson; +import com.google.gson.GsonBuilder; +import com.google.gson.JsonParseException; +import org.apache.lucene.analysis.Analyzer; + +public class AnalyzerManager { + + private final static String GENERAL = "general"; + private static final String ALPHA_IDEOGRAPH = "alpha"; + private static final String COMMON_TOKENS = "common_tokens"; + + private final Analyzer generalAnalyzer; + private final Analyzer alphaIdeoAnalyzer; + private final Analyzer commonTokensAnalyzer; + + private AnalyzerManager(Analyzer generalAnalyzer, + Analyzer alphaIdeoAnalyzer, + Analyzer commonTokensAnalyzer) { + this.generalAnalyzer = generalAnalyzer; + this.alphaIdeoAnalyzer = alphaIdeoAnalyzer; + this.commonTokensAnalyzer = commonTokensAnalyzer; + } + + public static AnalyzerManager newInstance() throws IOException { + InputStream is = AnalyzerManager.class.getClassLoader().getResourceAsStream("lucene-analyzers.json"); + Reader reader = new InputStreamReader(is, "UTF-8"); + GsonBuilder builder = new GsonBuilder(); + builder.registerTypeHierarchyAdapter(Map.class, new AnalyzerDeserializer()); + Gson gson = builder.create(); + Map<String, Analyzer> map = gson.fromJson(reader, Map.class); + Analyzer general = map.get(GENERAL); + Analyzer alphaIdeo = map.get(ALPHA_IDEOGRAPH); + Analyzer common = map.get(COMMON_TOKENS); + if (general == null) { + throw new JsonParseException("Must specify "+GENERAL + " analyzer"); + } + if (alphaIdeo == null) { + throw new JsonParseException("Must specify "+ ALPHA_IDEOGRAPH + " analyzer"); + } + if (common == null) { + throw new JsonParseException("Must specify "+ COMMON_TOKENS + " analyzer"); + } + + return new AnalyzerManager(general, alphaIdeo, common); + } + + /** + * This analyzer should be used to extract all tokens. 
+ * @return + */ + public Analyzer getGeneralAnalyzer() { + return generalAnalyzer; + } + + /** + * This analyzer is used to extract "alphabetic" tokens. + * @return + */ + public Analyzer getAlphaIdeoAnalyzer() { + return alphaIdeoAnalyzer; + } + + /** + * This analyzer should be used to generate common tokens lists from + * large corpora. It is not used by tika-eval in profiling or comparing. + * @return + */ + public Analyzer getCommonTokensAnalyzer() { + return commonTokensAnalyzer; + } +} http://git-wip-us.apache.org/repos/asf/tika/blob/aa7a0c35/tika-eval/src/main/java/org/apache/tika/eval/tokens/CJKBigramAwareLengthFilterFactory.java ---------------------------------------------------------------------- diff --git a/tika-eval/src/main/java/org/apache/tika/eval/tokens/CJKBigramAwareLengthFilterFactory.java b/tika-eval/src/main/java/org/apache/tika/eval/tokens/CJKBigramAwareLengthFilterFactory.java new file mode 100644 index 0000000..31fa866 --- /dev/null +++ b/tika-eval/src/main/java/org/apache/tika/eval/tokens/CJKBigramAwareLengthFilterFactory.java @@ -0,0 +1,74 @@ +package org.apache.tika.eval.tokens; + +import java.io.IOException; +import java.util.Map; + +import org.apache.lucene.analysis.FilteringTokenFilter; +import org.apache.lucene.analysis.TokenStream; +import org.apache.lucene.analysis.cjk.CJKBigramFilter; +import org.apache.lucene.analysis.tokenattributes.CharTermAttribute; +import org.apache.lucene.analysis.tokenattributes.TypeAttribute; +import org.apache.lucene.analysis.util.TokenFilterFactory; + +/** + * Creates a very narrowly focused TokenFilter that limits tokens based on length + * _unless_ they've been identified as <DOUBLE> or <SINGLE> + * by the CJKBigramFilter. + * + * This class is intended to be used when generating "common tokens" files. 
+ */ +public class CJKBigramAwareLengthFilterFactory extends TokenFilterFactory { + + + + private final int min; + private final int max; + public CJKBigramAwareLengthFilterFactory(Map<String, String> args) { + super(args); + min = Integer.parseInt(args.get("min")); + max = Integer.parseInt(args.get("max")); + } + + @Override + public TokenStream create(TokenStream tokenStream) { + return new CJKAwareLengthFilter(tokenStream); + } + + private class CJKAwareLengthFilter extends FilteringTokenFilter { + private final CharTermAttribute termAtt = addAttribute(CharTermAttribute.class); + private final TypeAttribute typeAtt = addAttribute(TypeAttribute.class); + + public CJKAwareLengthFilter(TokenStream in) { + super(in); + } + + @Override + protected boolean accept() throws IOException { + if ( termAtt.length() < min) { + String type = typeAtt.type(); + if (type == CJKBigramFilter.DOUBLE_TYPE || type == CJKBigramFilter.SINGLE_TYPE) { + return true; + } + } + return termAtt.length() >= min && termAtt.length() <= max; + } + } + + /* + private static boolean isCJ(int codePoint) { + if ( + (codePoint >= 0x4E00 && codePoint <= 0x9FFF) || + ( codePoint >= 0x3400 && codePoint <= 0x4dbf) || + ( codePoint >= 0x20000 && codePoint <= 0x2a6df) || + ( codePoint >= 0x2A700 && codePoint <= 0x2b73f) || + ( codePoint >= 0x2B740 && codePoint <= 0x2B81F) || + ( codePoint >= 0x2B820 && codePoint <- 0x2CEAF) || + ( codePoint >= 0xF900 && codePoint <= 0xFAFF) || + ( codePoint >= 0x2F800 && codePoint <= 0x2Fa1F) + ) { + return true; + } + return false; + }*/ + +} http://git-wip-us.apache.org/repos/asf/tika/blob/aa7a0c35/tika-eval/src/main/java/org/apache/tika/eval/tokens/CommonTokenCountManager.java ---------------------------------------------------------------------- diff --git a/tika-eval/src/main/java/org/apache/tika/eval/tokens/CommonTokenCountManager.java b/tika-eval/src/main/java/org/apache/tika/eval/tokens/CommonTokenCountManager.java new file mode 100644 index 0000000..b74daa1 --- 
/dev/null +++ b/tika-eval/src/main/java/org/apache/tika/eval/tokens/CommonTokenCountManager.java @@ -0,0 +1,141 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.tika.eval.tokens; + +import java.io.BufferedReader; +import java.io.IOException; +import java.nio.charset.Charset; +import java.nio.charset.StandardCharsets; +import java.nio.file.Files; +import java.nio.file.Path; +import java.util.HashSet; +import java.util.Map; +import java.util.Set; +import java.util.concurrent.ConcurrentHashMap; + +import org.apache.commons.lang3.mutable.MutableInt; +import org.slf4j.Logger; +import org.slf4j.LoggerFactory; + +public class CommonTokenCountManager { + + private final static Charset COMMON_TOKENS_CHARSET = StandardCharsets.UTF_8; + private final static Logger LOGGER = LoggerFactory.getLogger(CommonTokenCountManager.class); + + private final Path commonTokensDir; + + Map<String, Set<String>> commonTokenMap = new ConcurrentHashMap<>(); + Set<String> alreadyTriedToLoad = new HashSet<>(); + + //if we have no model or if no langid is passed in + //make this configurable + String defaultLangCode = "en"; + + public CommonTokenCountManager(Path commonTokensDir) throws IOException { + this.commonTokensDir = 
commonTokensDir; + tryToLoad(defaultLangCode); + //if you couldn't load it, make sure to add an empty + //set to prevent npes later + Set<String> set = commonTokenMap.get(defaultLangCode); + if (set == null) { + LOGGER.warn("No common tokens for default language: '"+defaultLangCode+"'"); + commonTokenMap.put(defaultLangCode, new HashSet<String>()); + } + } + + public CommonTokenResult countTokenOverlaps(String langCode, + Map<String, MutableInt> tokens) throws IOException { + String actualLangCode = getActualLangCode(langCode); + int overlap = 0; + Set<String> commonTokens = commonTokenMap.get(actualLangCode); + for (Map.Entry<String, MutableInt> e : tokens.entrySet()) { + if (commonTokens.contains(e.getKey())) { + overlap += e.getValue().intValue(); + } + } + return new CommonTokenResult(actualLangCode, overlap); + } + + //return langcode for lang that you are actually using + //lazily load the appropriate model + private String getActualLangCode(String langCode) { + if (langCode == null || "".equals(langCode)) { + return defaultLangCode; + } + if (commonTokenMap.containsKey(langCode)) { + return langCode; + } + tryToLoad(langCode); + Set<String> set = commonTokenMap.get(langCode); + if (set == null) { + return defaultLangCode; + } + return langCode; + + } + + public void close() throws IOException { + commonTokenMap.clear(); + } + + private synchronized void tryToLoad(String langCode) { + if (alreadyTriedToLoad.contains(langCode)) { + return; + } + //check once more now that we're in a + //synchronized block + if (commonTokenMap.get(langCode) != null) { + return; + } + Path p = commonTokensDir.resolve(langCode); + if (!Files.isRegularFile(p)) { + LOGGER.warn("Couldn't find common tokens file for: '"+langCode+"': "+ + p.toAbsolutePath()); + alreadyTriedToLoad.add(langCode); + return; + } + + Set<String> set = commonTokenMap.get(langCode); + if (set == null) { + set = new HashSet<>(); + commonTokenMap.put(langCode, set); + } + try (BufferedReader reader = 
Files.newBufferedReader(p, COMMON_TOKENS_CHARSET)) { + alreadyTriedToLoad.add(langCode); + String line = reader.readLine(); + while (line != null) { + line = line.trim(); + if (line.startsWith("#")) { + line = reader.readLine(); + continue; + } + //allow language models with, e.g. tab-delimited counts after the term + String[] cols = line.split("\t"); + String t = cols[0].trim(); + if (t.length() > 0) { + set.add(t); + } + + line = reader.readLine(); + } + } catch (IOException e) { + LOGGER.warn("IOException trying to read: '"+langCode+"'"); + } + } + +} http://git-wip-us.apache.org/repos/asf/tika/blob/aa7a0c35/tika-eval/src/main/java/org/apache/tika/eval/tokens/CommonTokenResult.java ---------------------------------------------------------------------- diff --git a/tika-eval/src/main/java/org/apache/tika/eval/tokens/CommonTokenResult.java b/tika-eval/src/main/java/org/apache/tika/eval/tokens/CommonTokenResult.java new file mode 100644 index 0000000..340ceca --- /dev/null +++ b/tika-eval/src/main/java/org/apache/tika/eval/tokens/CommonTokenResult.java @@ -0,0 +1,37 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
/**
 * Immutable result of comparing a document's tokens against a language's
 * common-tokens list: the language code of the list that was actually used
 * and how many token occurrences matched it.
 */
public class CommonTokenResult {

    private final String lang;
    private final int overlapCount;

    public CommonTokenResult(String langCode, int tokens) {
        this.lang = langCode;
        this.overlapCount = tokens;
    }

    /** @return language code of the common-tokens list that was used */
    public String getLangCode() {
        return lang;
    }

    /** @return number of token occurrences found in the common-tokens list */
    public int getTokens() {
        return overlapCount;
    }
}
+ */ + +package org.apache.tika.eval.tokens; + +public class ContrastStatistics { + + + double diceCoefficient; + double overlap; + + TokenIntPair[] topNUniqueA; + TokenIntPair[] topNUniqueB; + TokenIntPair[] topNMoreA; + TokenIntPair[] topNMoreB; + + void setDiceCoefficient(double diceCoefficient) { + this.diceCoefficient = diceCoefficient; + } + + void setOverlap(double overlap) { + this.overlap = overlap; + } + + void setTopNUniqueA(TokenIntPair[] topNUniqueA) { + this.topNUniqueA = topNUniqueA; + } + + void setTopNUniqueB(TokenIntPair[] topNUniqueB) { + this.topNUniqueB = topNUniqueB; + } + + void setTopNMoreA(TokenIntPair[] topNMoreA) { + this.topNMoreA = topNMoreA; + } + + void setTopNMoreB(TokenIntPair[] topNMoreB) { + this.topNMoreB = topNMoreB; + } + + public double getDiceCoefficient() { + return diceCoefficient; + } + + public double getOverlap() { + return overlap; + } + + public TokenIntPair[] getTopNUniqueA() { + return topNUniqueA; + } + + public TokenIntPair[] getTopNUniqueB() { + return topNUniqueB; + } + + public TokenIntPair[] getTopNMoreA() { + return topNMoreA; + } + + public TokenIntPair[] getTopNMoreB() { + return topNMoreB; + } +} http://git-wip-us.apache.org/repos/asf/tika/blob/aa7a0c35/tika-eval/src/main/java/org/apache/tika/eval/tokens/TokenContraster.java ---------------------------------------------------------------------- diff --git a/tika-eval/src/main/java/org/apache/tika/eval/tokens/TokenContraster.java b/tika-eval/src/main/java/org/apache/tika/eval/tokens/TokenContraster.java new file mode 100644 index 0000000..6cbbbc7 --- /dev/null +++ b/tika-eval/src/main/java/org/apache/tika/eval/tokens/TokenContraster.java @@ -0,0 +1,183 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. 
+ * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.tika.eval.tokens; + +import java.util.Map; + +import org.apache.commons.lang3.mutable.MutableInt; +import org.apache.lucene.util.PriorityQueue; + +/** + * Computes some corpus contrast statistics. + * + * Not thread safe. + */ +public class TokenContraster { + + private TokenStatistics tokenStatisticsA; + private TokenStatistics tokenStatisticsB; + + private TokenCountPriorityQueue uniqA; + private TokenCountPriorityQueue uniqB; + + private TokenCountDiffQueue moreA; + private TokenCountDiffQueue moreB; + + + private int topN = 10; + + private int diceCoefficientNum = 0; + private int overlapNum = 0; + + private double diceCoefficient = 0.0d; + private double overlap = 0.0; + + + public ContrastStatistics calculateContrastStatistics(Map<String, MutableInt> mapA, + TokenStatistics tokenStatisticsA, + Map<String, MutableInt> mapB, + TokenStatistics tokenStatisticsB) { + reset(); + this.tokenStatisticsA = tokenStatisticsA; + this.tokenStatisticsB = tokenStatisticsB; + + for (Map.Entry<String, MutableInt> e : mapA.entrySet()) { + MutableInt bVal = mapB.get(e.getKey()); + int b = (bVal == null) ? 
0 : bVal.intValue(); + add(e.getKey(), e.getValue().intValue(), b); + } + + for (Map.Entry<String, MutableInt> e : mapB.entrySet()) { + if (mapA.containsKey(e.getKey())) { + continue; + } + add(e.getKey(), 0, e.getValue().intValue()); + } + finishComputing(); + ContrastStatistics contrastStatistics = new ContrastStatistics(); + contrastStatistics.setDiceCoefficient(diceCoefficient); + contrastStatistics.setOverlap(overlap); + contrastStatistics.setTopNUniqueA(uniqA.getArray()); + contrastStatistics.setTopNUniqueB(uniqB.getArray()); + contrastStatistics.setTopNMoreA(moreA.getArray()); + contrastStatistics.setTopNMoreB(moreB.getArray()); + return contrastStatistics; + } + + private void reset() { + this.uniqA = new TokenCountPriorityQueue(topN); + this.uniqB = new TokenCountPriorityQueue(topN); + this.moreA = new TokenCountDiffQueue(topN); + this.moreB = new TokenCountDiffQueue(topN); + diceCoefficientNum = 0; + overlapNum = 0; + diceCoefficient = 0.0d; + overlap = 0.0; + + } + private void add(String token, int tokenCountA, int tokenCountB) { + if (tokenCountA > 0 && tokenCountB > 0) { + diceCoefficientNum += 2; + overlapNum += 2 * Math.min(tokenCountA, tokenCountB); + } + + + if (tokenCountA == 0L && tokenCountB > 0L) { + addToken(token, tokenCountB, uniqB); + } + if (tokenCountB == 0L && tokenCountA > 0L) { + addToken(token, tokenCountA, uniqA); + } + + if (tokenCountA > tokenCountB) { + addTokenDiff(token, tokenCountA, tokenCountA-tokenCountB, moreA); + } else if (tokenCountB > tokenCountA) { + addTokenDiff(token, tokenCountB, tokenCountB-tokenCountA, moreB); + + } + + } + + private void finishComputing() { + + long sumUniqTokens = tokenStatisticsA.getTotalUniqueTokens() + +tokenStatisticsB.getTotalUniqueTokens(); + + diceCoefficient = (double) diceCoefficientNum / (double) sumUniqTokens; + overlap = (float) overlapNum / (double) (tokenStatisticsA.getTotalTokens() + + tokenStatisticsB.getTotalTokens()); + + } + + private void addTokenDiff(String token, int 
tokenCount, int diff, TokenCountDiffQueue queue) { + if (queue.top() == null || queue.size() < topN || + diff >= queue.top().diff) { + queue.insertWithOverflow(new TokenCountDiff(token, diff, tokenCount)); + } + + } + + private void addToken(String token, int tokenCount, TokenCountPriorityQueue queue) { + if (queue.top() == null || queue.size() < topN || + tokenCount >= queue.top().getValue()) { + queue.insertWithOverflow(new TokenIntPair(token, tokenCount)); + } + + } + + class TokenCountDiffQueue extends PriorityQueue<TokenCountDiff> { + + TokenCountDiffQueue(int maxSize) { + super(maxSize); + } + + @Override + protected boolean lessThan(TokenCountDiff arg0, TokenCountDiff arg1) { + if (arg0.diff < arg1.diff) { + return true; + } else if (arg0.diff > arg1.diff) { + return false; + } + return arg1.token.compareTo(arg0.token) < 0; + } + + public TokenIntPair[] getArray() { + TokenIntPair[] topN = new TokenIntPair[size()]; + //now we reverse the queue + TokenCountDiff token = pop(); + int i = topN.length-1; + while (token != null && i > -1) { + topN[i--] = new TokenIntPair(token.token, token.diff); + token = pop(); + } + return topN; + } + } + + private class TokenCountDiff { + private final String token; + private final int diff; + private final int count; + + private TokenCountDiff(String token, int diff, int count) { + this.token = token; + this.diff = diff; + this.count = count; + } + } +} http://git-wip-us.apache.org/repos/asf/tika/blob/aa7a0c35/tika-eval/src/main/java/org/apache/tika/eval/tokens/TokenCountPriorityQueue.java ---------------------------------------------------------------------- diff --git a/tika-eval/src/main/java/org/apache/tika/eval/tokens/TokenCountPriorityQueue.java b/tika-eval/src/main/java/org/apache/tika/eval/tokens/TokenCountPriorityQueue.java new file mode 100644 index 0000000..624da0c --- /dev/null +++ b/tika-eval/src/main/java/org/apache/tika/eval/tokens/TokenCountPriorityQueue.java @@ -0,0 +1,49 @@ +/* + * Licensed to the Apache 
Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.tika.eval.tokens; + +import org.apache.lucene.util.PriorityQueue; + +public class TokenCountPriorityQueue extends PriorityQueue<TokenIntPair> { + + TokenCountPriorityQueue(int maxSize) { + super(maxSize); + } + + @Override + protected boolean lessThan(TokenIntPair arg0, TokenIntPair arg1) { + if (arg0.getValue() < arg1.getValue()) { + return true; + } else if (arg0.getValue() > arg1.getValue()) { + return false; + } + return arg1.token.compareTo(arg0.token) < 0; + } + + public TokenIntPair[] getArray() { + TokenIntPair[] topN = new TokenIntPair[size()]; + //now we reverse the queue + TokenIntPair term = pop(); + int i = topN.length-1; + while (term != null && i > -1) { + topN[i--] = term; + term = pop(); + } + return topN; + } +} \ No newline at end of file