This is an automated email from the ASF dual-hosted git repository. nick pushed a commit to branch master in repository https://gitbox.apache.org/repos/asf/tika.git
The following commit(s) were added to refs/heads/master by this push: new c05c524 Serialise the details of multiple parsers c05c524 is described below commit c05c5245bdea38ddb9ef24e99ddcdc7bf7803d67 Author: Nick Burch <n...@gagravarr.org> AuthorDate: Sun Apr 8 13:56:38 2018 +0100 Serialise the details of multiple parsers --- .../apache/tika/config/TikaConfigSerializer.java | 14 ++++++++++ .../parser/multiple/AbstractMultipleParser.java | 4 +++ .../multiple/PickBestTextEncodingParser.java | 30 ++++++++++++++++------ 3 files changed, 40 insertions(+), 8 deletions(-) diff --git a/tika-core/src/main/java/org/apache/tika/config/TikaConfigSerializer.java b/tika-core/src/main/java/org/apache/tika/config/TikaConfigSerializer.java index c67b03b..dda1675 100644 --- a/tika-core/src/main/java/org/apache/tika/config/TikaConfigSerializer.java +++ b/tika-core/src/main/java/org/apache/tika/config/TikaConfigSerializer.java @@ -45,6 +45,7 @@ import org.apache.tika.parser.DefaultParser; import org.apache.tika.parser.ParseContext; import org.apache.tika.parser.Parser; import org.apache.tika.parser.ParserDecorator; +import org.apache.tika.parser.multiple.AbstractMultipleParser; import org.apache.tika.utils.XMLReaderUtils; import org.w3c.dom.Document; import org.w3c.dom.Element; @@ -254,6 +255,9 @@ public class TikaConfigSerializer { (mode == Mode.STATIC || mode == Mode.STATIC_FULL)) { outputParser = false; } + } else if (parser instanceof AbstractMultipleParser) { + // Always output the parsers that go into the multiple + children = ((AbstractMultipleParser)parser).getAllParsers(); } if (outputParser) { @@ -290,6 +294,16 @@ public class TikaConfigSerializer { parserElement.setAttribute("class", className); rootElement.appendChild(parserElement); + // TODO Output configurable parameters in a genric way, see TIKA-1508 + if (parser instanceof AbstractMultipleParser) { + Element paramsElement = doc.createElement("params"); + Element paramElement = doc.createElement("param"); + paramElement.setAttribute("name", "metadataPolicy"); + paramElement.setAttribute("value", ((AbstractMultipleParser)parser).getMetadataPolicy().toString()); + paramsElement.appendChild(paramElement); + parserElement.appendChild(paramsElement); + } + for (MediaType type : addedTypes) { Element mimeElement = doc.createElement("mime"); mimeElement.appendChild(doc.createTextNode(type.toString())); diff --git a/tika-core/src/main/java/org/apache/tika/parser/multiple/AbstractMultipleParser.java b/tika-core/src/main/java/org/apache/tika/parser/multiple/AbstractMultipleParser.java index d687e41..1a58f89 100644 --- a/tika-core/src/main/java/org/apache/tika/parser/multiple/AbstractMultipleParser.java +++ b/tika-core/src/main/java/org/apache/tika/parser/multiple/AbstractMultipleParser.java @@ -25,6 +25,7 @@ import java.io.InputStream; import java.util.ArrayList; import java.util.Arrays; import java.util.Collection; +import java.util.Collections; import java.util.HashSet; import java.util.List; import java.util.Map; @@ -173,6 +174,9 @@ public abstract class AbstractMultipleParser extends AbstractParser { public MetadataPolicy getMetadataPolicy() { return policy; } + public List<Parser> getAllParsers() { + return Collections.unmodifiableList(new ArrayList<>(parsers)); + } /** * Used to allow implementations to prepare or change things diff --git a/tika-core/src/main/java/org/apache/tika/parser/multiple/PickBestTextEncodingParser.java b/tika-parsers/src/main/java/org/apache/tika/parser/multiple/PickBestTextEncodingParser.java similarity index 87% rename from tika-core/src/main/java/org/apache/tika/parser/multiple/PickBestTextEncodingParser.java rename to tika-parsers/src/main/java/org/apache/tika/parser/multiple/PickBestTextEncodingParser.java index f043a5a..b1a0caa 100644 --- a/tika-core/src/main/java/org/apache/tika/parser/multiple/PickBestTextEncodingParser.java +++ b/tika-parsers/src/main/java/org/apache/tika/parser/multiple/PickBestTextEncodingParser.java @@ -31,9 +31,9 @@ import org.apache.tika.detect.NonDetectingEncodingDetector; import org.apache.tika.exception.TikaException; import org.apache.tika.metadata.Metadata; import org.apache.tika.mime.MediaTypeRegistry; -import org.apache.tika.parser.EmptyParser; import org.apache.tika.parser.ParseContext; import org.apache.tika.parser.Parser; +import org.apache.tika.parser.txt.TXTParser; import org.apache.tika.sax.BodyContentHandler; import org.apache.tika.sax.ContentHandlerFactory; import org.xml.sax.ContentHandler; @@ -49,8 +49,8 @@ import org.xml.sax.SAXException; * This is not recommended for actual production use... It is mostly to * prove that the {@link AbstractMultipleParser} environment is * sufficient to support this use-case - * - * TODO Move this to the parsers package so it can get {@link TXTParser} + * + * TODO Implement proper "Junk" detection * * @deprecated Currently not suitable for real use, more a demo / prototype! */ @@ -66,7 +66,6 @@ public class PickBestTextEncodingParser extends AbstractMultipleParser { private String[] charsetsToTry; public PickBestTextEncodingParser(MediaTypeRegistry registry, String[] charsets) { - // TODO Actually give 1 more TXTParser than we have charsets super(registry, MetadataPolicy.DISCARD_ALL, makeParsers(charsets)); this.charsetsToTry = charsets; } @@ -74,8 +73,7 @@ public class PickBestTextEncodingParser extends AbstractMultipleParser { // One more TXTParser than we have charsets, for the real thing List<Parser> parsers = new ArrayList<>(charsets.length+1); for (int i=0; i<charsets.length+1; i++) { - // TODO Actually get the right parser, TXTParser - parsers.set(i, new EmptyParser()); + parsers.set(i, new TXTParser()); } return parsers; } @@ -104,9 +102,25 @@ public class PickBestTextEncodingParser extends AbstractMultipleParser { charsetTester.charsetText.put(charset, handler.toString()); // If this was the last real charset, see which one is best + // TODO Do this in a more generic, less english-only way! if (! charsetTester.moreToTest()) { - // TODO Properly work out the best! - charsetTester.pickedCharset = charsetsToTry[0]; + int numEnglish = 0; + String bestcharset = null; + for (String pcharset : charsetTester.charsetText.keySet()) { + String text = charsetTester.charsetText.get(pcharset); + int cEnglish = 0; + for (char c : text.toCharArray()) { + if ((c >= 'a' && c <= 'z') || (c >= 'A' && c <= 'Z') || + (c >= '0' && c <= '9')) { + cEnglish++; + } + } + if (cEnglish > numEnglish) { + numEnglish = cEnglish; + bestcharset = pcharset; + } + } + charsetTester.pickedCharset = bestcharset; } } -- To stop receiving notification emails like this one, please contact n...@apache.org.