1. Call startDocument()/endDocument() on the XHTML handler; 2. populate the metadata before the handler is called; 3. set topN to 2 in both the REST and script configs.
Project: http://git-wip-us.apache.org/repos/asf/tika/repo Commit: http://git-wip-us.apache.org/repos/asf/tika/commit/52be4259 Tree: http://git-wip-us.apache.org/repos/asf/tika/tree/52be4259 Diff: http://git-wip-us.apache.org/repos/asf/tika/diff/52be4259 Branch: refs/heads/TIKA-1508 Commit: 52be42599bf406c9cc5ae16f539f2b57d4000af8 Parents: 0096dd7 Author: Chris Mattmann <[email protected]> Authored: Sun Aug 14 11:37:32 2016 -0700 Committer: Chris Mattmann <[email protected]> Committed: Sun Aug 14 11:37:32 2016 -0700 ---------------------------------------------------------------------- .../recognition/ObjectRecognitionParser.java | 47 ++++++++++++++------ .../recognition/tika-config-tflow-rest.xml | 4 +- 2 files changed, 35 insertions(+), 16 deletions(-) ---------------------------------------------------------------------- http://git-wip-us.apache.org/repos/asf/tika/blob/52be4259/tika-parsers/src/main/java/org/apache/tika/parser/recognition/ObjectRecognitionParser.java ---------------------------------------------------------------------- diff --git a/tika-parsers/src/main/java/org/apache/tika/parser/recognition/ObjectRecognitionParser.java b/tika-parsers/src/main/java/org/apache/tika/parser/recognition/ObjectRecognitionParser.java index a44564b..4cb1364 100644 --- a/tika-parsers/src/main/java/org/apache/tika/parser/recognition/ObjectRecognitionParser.java +++ b/tika-parsers/src/main/java/org/apache/tika/parser/recognition/ObjectRecognitionParser.java @@ -39,6 +39,7 @@ import java.io.InputStream; import java.util.Collections; import java.util.Comparator; import java.util.List; +import java.util.ArrayList; import java.util.Locale; import java.util.Map; import java.util.Set; @@ -121,32 +122,50 @@ public class ObjectRecognitionParser extends AbstractParser implements Initializ LOG.debug("Time taken {}ms", System.currentTimeMillis() - start); if (objects != null && !objects.isEmpty()) { - XHTMLContentHandler xhtml = new XHTMLContentHandler(handler, metadata); - 
xhtml.startElement("ol", "id", "objects"); Collections.sort(objects, DESC_CONFIDENCE_SORTER); - int count = 0; - for (RecognisedObject object : objects) { + int count = 0; + List<RecognisedObject> acceptedObjects = new ArrayList<RecognisedObject>(topN); + + // first process all the MD objects + for (RecognisedObject object: objects){ if (object.getConfidence() >= minConfidence) { - LOG.debug("Add {}", object); - count++; - String mdValue = String.format(Locale.ENGLISH, "%s (%.5f)", - object.getLabel(), object.getConfidence()); - metadata.add(MD_KEY, mdValue); + if (object.getConfidence() >= minConfidence) { + count++; + LOG.debug("Add {}", object); + String mdValue = String.format(Locale.ENGLISH, "%s (%.5f)", + object.getLabel(), object.getConfidence()); + metadata.add(MD_KEY, mdValue); + acceptedObjects.add(object); + if (count >= topN) { + break; + } + } + else{ + LOG.warn("Object {} confidence {} less than min {}", object, object.getConfidence(), minConfidence); + } + } + } + + // now the handler + XHTMLContentHandler xhtml = new XHTMLContentHandler(handler, metadata); + xhtml.startDocument(); + xhtml.startElement("ol", "id", "objects"); + count = 0; + for (RecognisedObject object : acceptedObjects) { //writing to handler xhtml.startElement("li", "id", object.getId()); String text = String.format(Locale.ENGLISH, " %s [%s](confidence = %f )", object.getLabel(), object.getLabelLang(), object.getConfidence()); - xhtml.characters(text); + xhtml.characters(text); xhtml.endElement("li"); - if (count >= topN) { - break; - } - } } + xhtml.endElement("ol"); + xhtml.endDocument(); } else { LOG.warn("NO objects"); metadata.add("no.objects", Boolean.TRUE.toString()); } + } } http://git-wip-us.apache.org/repos/asf/tika/blob/52be4259/tika-parsers/src/test/resources/org/apache/tika/parser/recognition/tika-config-tflow-rest.xml ---------------------------------------------------------------------- diff --git 
a/tika-parsers/src/test/resources/org/apache/tika/parser/recognition/tika-config-tflow-rest.xml b/tika-parsers/src/test/resources/org/apache/tika/parser/recognition/tika-config-tflow-rest.xml index ad72c95..ddb42ec 100644 --- a/tika-parsers/src/test/resources/org/apache/tika/parser/recognition/tika-config-tflow-rest.xml +++ b/tika-parsers/src/test/resources/org/apache/tika/parser/recognition/tika-config-tflow-rest.xml @@ -21,10 +21,10 @@ <parser class="org.apache.tika.parser.recognition.ObjectRecognitionParser"> <mime>image/jpeg</mime> <params> - <param name="topN" type="int">7</param> + <param name="topN" type="int">2</param> <param name="minConfidence" type="double">0.015</param> <param name="class" type="string">org.apache.tika.parser.recognition.tf.TensorflowRESTRecogniser</param> </params> </parser> </parsers> -</properties> \ No newline at end of file +</properties>
