This is an automated email from the ASF dual-hosted git repository.

nick pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/tika.git


The following commit(s) were added to refs/heads/master by this push:
     new c05c524  Serialise the details of multiple parsers
c05c524 is described below

commit c05c5245bdea38ddb9ef24e99ddcdc7bf7803d67
Author: Nick Burch <n...@gagravarr.org>
AuthorDate: Sun Apr 8 13:56:38 2018 +0100

    Serialise the details of multiple parsers
---
 .../apache/tika/config/TikaConfigSerializer.java   | 14 ++++++++++
 .../parser/multiple/AbstractMultipleParser.java    |  4 +++
 .../multiple/PickBestTextEncodingParser.java       | 30 ++++++++++++++++------
 3 files changed, 40 insertions(+), 8 deletions(-)

diff --git 
a/tika-core/src/main/java/org/apache/tika/config/TikaConfigSerializer.java 
b/tika-core/src/main/java/org/apache/tika/config/TikaConfigSerializer.java
index c67b03b..dda1675 100644
--- a/tika-core/src/main/java/org/apache/tika/config/TikaConfigSerializer.java
+++ b/tika-core/src/main/java/org/apache/tika/config/TikaConfigSerializer.java
@@ -45,6 +45,7 @@ import org.apache.tika.parser.DefaultParser;
 import org.apache.tika.parser.ParseContext;
 import org.apache.tika.parser.Parser;
 import org.apache.tika.parser.ParserDecorator;
+import org.apache.tika.parser.multiple.AbstractMultipleParser;
 import org.apache.tika.utils.XMLReaderUtils;
 import org.w3c.dom.Document;
 import org.w3c.dom.Element;
@@ -254,6 +255,9 @@ public class TikaConfigSerializer {
                     (mode == Mode.STATIC || mode == Mode.STATIC_FULL)) {
                 outputParser = false;
             }
+        } else if (parser instanceof AbstractMultipleParser) {
+            // Always output the parsers that go into the multiple
+            children = ((AbstractMultipleParser)parser).getAllParsers();
         }
 
         if (outputParser) {
@@ -290,6 +294,16 @@ public class TikaConfigSerializer {
         parserElement.setAttribute("class", className);
         rootElement.appendChild(parserElement);
 
+        // TODO Output configurable parameters in a generic way, see TIKA-1508
+        if (parser instanceof AbstractMultipleParser) {
+            Element paramsElement = doc.createElement("params");
+            Element paramElement = doc.createElement("param");
+            paramElement.setAttribute("name", "metadataPolicy");
+            paramElement.setAttribute("value", 
((AbstractMultipleParser)parser).getMetadataPolicy().toString());
+            paramsElement.appendChild(paramElement);
+            parserElement.appendChild(paramsElement);
+        }
+        
         for (MediaType type : addedTypes) {
             Element mimeElement = doc.createElement("mime");
             mimeElement.appendChild(doc.createTextNode(type.toString()));
diff --git 
a/tika-core/src/main/java/org/apache/tika/parser/multiple/AbstractMultipleParser.java
 
b/tika-core/src/main/java/org/apache/tika/parser/multiple/AbstractMultipleParser.java
index d687e41..1a58f89 100644
--- 
a/tika-core/src/main/java/org/apache/tika/parser/multiple/AbstractMultipleParser.java
+++ 
b/tika-core/src/main/java/org/apache/tika/parser/multiple/AbstractMultipleParser.java
@@ -25,6 +25,7 @@ import java.io.InputStream;
 import java.util.ArrayList;
 import java.util.Arrays;
 import java.util.Collection;
+import java.util.Collections;
 import java.util.HashSet;
 import java.util.List;
 import java.util.Map;
@@ -173,6 +174,9 @@ public abstract class AbstractMultipleParser extends 
AbstractParser {
     public MetadataPolicy getMetadataPolicy() {
         return policy;
     }
+    public List<Parser> getAllParsers() {
+        return Collections.unmodifiableList(new ArrayList<>(parsers));
+    }
     
     /**
      * Used to allow implementations to prepare or change things
diff --git 
a/tika-core/src/main/java/org/apache/tika/parser/multiple/PickBestTextEncodingParser.java
 
b/tika-parsers/src/main/java/org/apache/tika/parser/multiple/PickBestTextEncodingParser.java
similarity index 87%
rename from 
tika-core/src/main/java/org/apache/tika/parser/multiple/PickBestTextEncodingParser.java
rename to 
tika-parsers/src/main/java/org/apache/tika/parser/multiple/PickBestTextEncodingParser.java
index f043a5a..b1a0caa 100644
--- 
a/tika-core/src/main/java/org/apache/tika/parser/multiple/PickBestTextEncodingParser.java
+++ 
b/tika-parsers/src/main/java/org/apache/tika/parser/multiple/PickBestTextEncodingParser.java
@@ -31,9 +31,9 @@ import org.apache.tika.detect.NonDetectingEncodingDetector;
 import org.apache.tika.exception.TikaException;
 import org.apache.tika.metadata.Metadata;
 import org.apache.tika.mime.MediaTypeRegistry;
-import org.apache.tika.parser.EmptyParser;
 import org.apache.tika.parser.ParseContext;
 import org.apache.tika.parser.Parser;
+import org.apache.tika.parser.txt.TXTParser;
 import org.apache.tika.sax.BodyContentHandler;
 import org.apache.tika.sax.ContentHandlerFactory;
 import org.xml.sax.ContentHandler;
@@ -49,8 +49,8 @@ import org.xml.sax.SAXException;
  * This is not recommended for actual production use... It is mostly to
  *  prove that the {@link AbstractMultipleParser} environment is
  *  sufficient to support this use-case
- *  
- * TODO Move this to the parsers package so it can get {@link TXTParser}
+ *
+ * TODO Implement proper "Junk" detection
  *
  * @deprecated Currently not suitable for real use, more a demo / prototype!
  */
@@ -66,7 +66,6 @@ public class PickBestTextEncodingParser extends 
AbstractMultipleParser {
     private String[] charsetsToTry;
     
     public PickBestTextEncodingParser(MediaTypeRegistry registry, String[] 
charsets) {
-        // TODO Actually give 1 more TXTParser than we have charsets
         super(registry, MetadataPolicy.DISCARD_ALL, makeParsers(charsets));
         this.charsetsToTry = charsets;
     }
@@ -74,8 +73,7 @@ public class PickBestTextEncodingParser extends 
AbstractMultipleParser {
         // One more TXTParser than we have charsets, for the real thing
         List<Parser> parsers = new ArrayList<>(charsets.length+1);
         for (int i=0; i<charsets.length+1; i++) {
-            // TODO Actually get the right parser, TXTParser
-            parsers.set(i, new EmptyParser());
+            parsers.set(i, new TXTParser());
         }
         return parsers;
     }
@@ -104,9 +102,25 @@ public class PickBestTextEncodingParser extends 
AbstractMultipleParser {
             charsetTester.charsetText.put(charset, handler.toString());
             
             // If this was the last real charset, see which one is best
+            // TODO Do this in a more generic, less English-only way!
             if (! charsetTester.moreToTest()) {
-                // TODO Properly work out the best!
-                charsetTester.pickedCharset = charsetsToTry[0];
+                int numEnglish = 0;
+                String bestcharset = null;
+                for (String pcharset : charsetTester.charsetText.keySet()) {
+                    String text = charsetTester.charsetText.get(pcharset);
+                    int cEnglish = 0;
+                    for (char c : text.toCharArray()) {
+                       if ((c >= 'a' && c <= 'z') || (c >= 'A' && c <= 'Z') ||
+                           (c >= '0' && c <= '9')) {
+                           cEnglish++;
+                       }
+                    }
+                    if (cEnglish > numEnglish) {
+                        numEnglish = cEnglish;
+                        bestcharset = pcharset;
+                    }
+                }
+                charsetTester.pickedCharset = bestcharset;
             }
         }
         

-- 
To stop receiving notification emails like this one, please contact
n...@apache.org.

Reply via email to