1. Use startDocument()/endDocument() in the handler; 2. populate metadata before
the handler is called; 3. make topN 2 in both REST and script configs.


Project: http://git-wip-us.apache.org/repos/asf/tika/repo
Commit: http://git-wip-us.apache.org/repos/asf/tika/commit/52be4259
Tree: http://git-wip-us.apache.org/repos/asf/tika/tree/52be4259
Diff: http://git-wip-us.apache.org/repos/asf/tika/diff/52be4259

Branch: refs/heads/master
Commit: 52be42599bf406c9cc5ae16f539f2b57d4000af8
Parents: 0096dd7
Author: Chris Mattmann <[email protected]>
Authored: Sun Aug 14 11:37:32 2016 -0700
Committer: Chris Mattmann <[email protected]>
Committed: Sun Aug 14 11:37:32 2016 -0700

----------------------------------------------------------------------
 .../recognition/ObjectRecognitionParser.java    | 47 ++++++++++++++------
 .../recognition/tika-config-tflow-rest.xml      |  4 +-
 2 files changed, 35 insertions(+), 16 deletions(-)
----------------------------------------------------------------------


http://git-wip-us.apache.org/repos/asf/tika/blob/52be4259/tika-parsers/src/main/java/org/apache/tika/parser/recognition/ObjectRecognitionParser.java
----------------------------------------------------------------------
diff --git 
a/tika-parsers/src/main/java/org/apache/tika/parser/recognition/ObjectRecognitionParser.java
 
b/tika-parsers/src/main/java/org/apache/tika/parser/recognition/ObjectRecognitionParser.java
index a44564b..4cb1364 100644
--- 
a/tika-parsers/src/main/java/org/apache/tika/parser/recognition/ObjectRecognitionParser.java
+++ 
b/tika-parsers/src/main/java/org/apache/tika/parser/recognition/ObjectRecognitionParser.java
@@ -39,6 +39,7 @@ import java.io.InputStream;
 import java.util.Collections;
 import java.util.Comparator;
 import java.util.List;
+import java.util.ArrayList;
 import java.util.Locale;
 import java.util.Map;
 import java.util.Set;
@@ -121,32 +122,50 @@ public class ObjectRecognitionParser extends 
AbstractParser implements Initializ
         LOG.debug("Time taken {}ms", System.currentTimeMillis() - start);
         if (objects != null && !objects.isEmpty()) {
 
-            XHTMLContentHandler xhtml = new XHTMLContentHandler(handler, 
metadata);
-            xhtml.startElement("ol", "id", "objects");
             Collections.sort(objects, DESC_CONFIDENCE_SORTER);
-            int count = 0;
-            for (RecognisedObject object : objects) {
+           int count = 0;
+           List<RecognisedObject> acceptedObjects = new 
ArrayList<RecognisedObject>(topN);
+
+           // first process all the MD objects
+           for (RecognisedObject object: objects){
                 if (object.getConfidence() >= minConfidence) {
-                    LOG.debug("Add {}", object);
-                    count++;
-                    String mdValue = String.format(Locale.ENGLISH, "%s (%.5f)",
-                            object.getLabel(), object.getConfidence());
-                    metadata.add(MD_KEY, mdValue);
+                   if (object.getConfidence() >= minConfidence) {
+                       count++;
+                       LOG.debug("Add {}", object);
+                       String mdValue = String.format(Locale.ENGLISH, "%s 
(%.5f)",
+                                                  object.getLabel(), 
object.getConfidence());
+                       metadata.add(MD_KEY, mdValue);
+                       acceptedObjects.add(object);
+                       if (count >= topN) {
+                           break;
+                       }
+                   }
+                   else{
+                       LOG.warn("Object {} confidence {} less than min {}", 
object, object.getConfidence(), minConfidence);
+                   }
+               }
+           }
+
+           // now the handler
+            XHTMLContentHandler xhtml = new XHTMLContentHandler(handler, 
metadata);
+           xhtml.startDocument();
+            xhtml.startElement("ol", "id", "objects");
+            count = 0;
+            for (RecognisedObject object : acceptedObjects) {
                     //writing to handler
                     xhtml.startElement("li", "id", object.getId());
                     String text = String.format(Locale.ENGLISH, " %s 
[%s](confidence = %f )",
                             object.getLabel(), object.getLabelLang(), 
object.getConfidence());
-                    xhtml.characters(text);
+                   xhtml.characters(text);
                     xhtml.endElement("li");
-                    if (count >= topN) {
-                        break;
-                    }
-                }
             }
+
             xhtml.endElement("ol");
+           xhtml.endDocument();
         } else {
             LOG.warn("NO objects");
             metadata.add("no.objects", Boolean.TRUE.toString());
         }
+
     }
 }

http://git-wip-us.apache.org/repos/asf/tika/blob/52be4259/tika-parsers/src/test/resources/org/apache/tika/parser/recognition/tika-config-tflow-rest.xml
----------------------------------------------------------------------
diff --git 
a/tika-parsers/src/test/resources/org/apache/tika/parser/recognition/tika-config-tflow-rest.xml
 
b/tika-parsers/src/test/resources/org/apache/tika/parser/recognition/tika-config-tflow-rest.xml
index ad72c95..ddb42ec 100644
--- 
a/tika-parsers/src/test/resources/org/apache/tika/parser/recognition/tika-config-tflow-rest.xml
+++ 
b/tika-parsers/src/test/resources/org/apache/tika/parser/recognition/tika-config-tflow-rest.xml
@@ -21,10 +21,10 @@
         <parser 
class="org.apache.tika.parser.recognition.ObjectRecognitionParser">
             <mime>image/jpeg</mime>
             <params>
-                <param name="topN" type="int">7</param>
+                <param name="topN" type="int">2</param>
                 <param name="minConfidence" type="double">0.015</param>
                 <param name="class" 
type="string">org.apache.tika.parser.recognition.tf.TensorflowRESTRecogniser</param>
             </params>
         </parser>
     </parsers>
-</properties>
\ No newline at end of file
+</properties>

Reply via email to