Add an object recognition parser, a TensorFlow-based implementation, and test
cases for both


Project: http://git-wip-us.apache.org/repos/asf/tika/repo
Commit: http://git-wip-us.apache.org/repos/asf/tika/commit/2184e2c2
Tree: http://git-wip-us.apache.org/repos/asf/tika/tree/2184e2c2
Diff: http://git-wip-us.apache.org/repos/asf/tika/diff/2184e2c2

Branch: refs/heads/TIKA-1508
Commit: 2184e2c2c2a0e507be6be4f9692e0fab5b38a476
Parents: eccc153
Author: Thamme Gowda <[email protected]>
Authored: Sat Jun 11 19:04:49 2016 -0700
Committer: Thamme Gowda <[email protected]>
Committed: Sat Jun 11 19:04:49 2016 -0700

----------------------------------------------------------------------
 .../parser/recognition/ObjectRecogniser.java    |  73 +++++++++
 .../recognition/ObjectRecognitionParser.java    | 143 ++++++++++++++++++
 .../parser/recognition/RecognisedObject.java    |  91 +++++++++++
 .../tf/TensorflowImageRecParser.java            | 149 +++++++++++++++++++
 .../ObjectRecognitionParserTest.java            |  68 +++++++++
 .../tf/TensorflowImageRecParserTest.java        |  56 +++++++
 .../parser/recognition/tika-config-tflow.xml    |  29 ++++
 7 files changed, 609 insertions(+)
----------------------------------------------------------------------


http://git-wip-us.apache.org/repos/asf/tika/blob/2184e2c2/tika-parsers/src/main/java/org/apache/tika/parser/recognition/ObjectRecogniser.java
----------------------------------------------------------------------
diff --git 
a/tika-parsers/src/main/java/org/apache/tika/parser/recognition/ObjectRecogniser.java
 
b/tika-parsers/src/main/java/org/apache/tika/parser/recognition/ObjectRecogniser.java
new file mode 100644
index 0000000..3776c1e
--- /dev/null
+++ 
b/tika-parsers/src/main/java/org/apache/tika/parser/recognition/ObjectRecogniser.java
@@ -0,0 +1,73 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *    http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.tika.parser.recognition;
+
+import org.apache.tika.base.Configurable;
+import org.apache.tika.exception.TikaConfigException;
+import org.apache.tika.exception.TikaException;
+import org.apache.tika.metadata.Metadata;
+import org.apache.tika.mime.MediaType;
+import org.apache.tika.parser.ParseContext;
+import org.apache.tika.parser.recognition.tf.TensorflowImageRecParser;
+import org.xml.sax.ContentHandler;
+import org.xml.sax.SAXException;
+
+import java.io.IOException;
+import java.io.InputStream;
+import java.util.List;
+import java.util.Set;
+
+/**
+ * Contract for object recognisers used by {@link ObjectRecognitionParser}.
+ *
+ * @see TensorflowImageRecParser
+ */
+public interface ObjectRecogniser  extends Configurable {
+
+    /**
+     * The mimes supported by this recogniser.
+     *
+     * @return set of media types
+     */
+    Set<MediaType> getSupportedMimes();
+
+    /**
+     * Tells whether this service is available.
+     *
+     * @return {@code true} when the service is available, {@code false} otherwise
+     */
+    boolean isAvailable();
+
+    /**
+     * This is the hook for configuring the recogniser.
+     *
+     * @param context configuration instance in the form of context
+     * @throws TikaConfigException when there is an issue with configuration
+     */
+    void configure(ParseContext context) throws TikaConfigException;
+
+    /**
+     * Recognise the objects in the stream.
+     *
+     * @param stream content stream
+     * @param handler Tika's content handler
+     * @param metadata metadata instance
+     * @param context parser context
+     * @return List of {@link RecognisedObject}s
+     * @throws IOException when an I/O error occurs
+     * @throws SAXException when an issue with XML occurs
+     * @throws TikaException any generic error
+     */
+    List<RecognisedObject> recognise(InputStream stream, ContentHandler 
handler,
+                                     Metadata metadata, ParseContext context)
+            throws IOException, SAXException, TikaException;
+}

http://git-wip-us.apache.org/repos/asf/tika/blob/2184e2c2/tika-parsers/src/main/java/org/apache/tika/parser/recognition/ObjectRecognitionParser.java
----------------------------------------------------------------------
diff --git 
a/tika-parsers/src/main/java/org/apache/tika/parser/recognition/ObjectRecognitionParser.java
 
b/tika-parsers/src/main/java/org/apache/tika/parser/recognition/ObjectRecognitionParser.java
new file mode 100644
index 0000000..274f884
--- /dev/null
+++ 
b/tika-parsers/src/main/java/org/apache/tika/parser/recognition/ObjectRecognitionParser.java
@@ -0,0 +1,143 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *    http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.tika.parser.recognition;
+
+import org.apache.tika.config.Field;
+import org.apache.tika.exception.TikaConfigException;
+import org.apache.tika.exception.TikaException;
+import org.apache.tika.metadata.Metadata;
+import org.apache.tika.mime.MediaType;
+import org.apache.tika.parser.AbstractParser;
+import org.apache.tika.parser.ParseContext;
+import org.apache.tika.parser.recognition.tf.TensorflowImageRecParser;
+import org.apache.tika.sax.XHTMLContentHandler;
+import org.apache.tika.utils.AnnotationUtils;
+import org.apache.tika.utils.ServiceLoaderUtils;
+import org.slf4j.Logger;
+import org.slf4j.LoggerFactory;
+import org.xml.sax.ContentHandler;
+import org.xml.sax.SAXException;
+
+import java.io.IOException;
+import java.io.InputStream;
+import java.util.Collections;
+import java.util.Comparator;
+import java.util.List;
+import java.util.Set;
+
+
+/**
+ * This parser recognises objects from images.
+ * The object recognition implementation can be switched using the 'class' argument.
+ *
+ * <b>Example Usage : </b>
+ * <pre>
+ * &lt;properties&gt;
+ *  &lt;parsers&gt;
+ *   &lt;parser class=&quot;org.apache.tika.parser.recognition.ObjectRecognitionParser&quot;&gt;
+ *    &lt;mime&gt;image/jpeg&lt;/mime&gt;
+ *    &lt;params&gt;
+ *      &lt;param name=&quot;topN&quot; type=&quot;int&quot;&gt;2&lt;/param&gt;
+ *      &lt;param name=&quot;minConfidence&quot; type=&quot;double&quot;&gt;0.015&lt;/param&gt;
+ *      &lt;param name=&quot;class&quot; type=&quot;string&quot;&gt;org.apache.tika.parser.recognition.tf.TensorflowImageRecParser&lt;/param&gt;
+ *    &lt;/params&gt;
+ *   &lt;/parser&gt;
+ *  &lt;/parsers&gt;
+ * &lt;/properties&gt;
+ * </pre>
+ *
+ * @since Apache Tika 1.14
+ */
+public class ObjectRecognitionParser extends AbstractParser {
+
+    public static final Logger LOG = LoggerFactory.getLogger(ObjectRecognitionParser.class);
+
+    /** Metadata key under which the label of every reported object is added. */
+    public static final String MD_KEY = "OBJECT";
+
+    /** Orders recognised objects from the highest confidence to the lowest. */
+    private static final Comparator<RecognisedObject> DESC_CONFIDENCE_SORTER =
+            new Comparator<RecognisedObject>() {
+                @Override
+                public int compare(RecognisedObject o1, RecognisedObject o2) {
+                    return Double.compare(o2.getConfidence(), o1.getConfidence());
+                }
+            };
+
+    /** Objects with a confidence below this value are not reported. */
+    @Field private double minConfidence = 0.05;
+
+    /** Maximum number of objects reported per parse. */
+    @Field private int topN = 2;
+
+    /** Recogniser that does the actual work; replaced via the 'class' parameter. */
+    private ObjectRecogniser recogniser = new TensorflowImageRecParser();
+
+    /**
+     * Replaces the recogniser with a new instance of the named class.
+     *
+     * @param recogniserClass name of the {@link ObjectRecogniser} implementation class
+     */
+    @Field(name = "class")
+    public void setRecogniser(String recogniserClass) {
+        this.recogniser = ServiceLoaderUtils.newInstance(recogniserClass);
+    }
+
+    /**
+     * Configures this parser, then passes the same parameters on to the recogniser
+     * and configures it as well.
+     *
+     * @param context configuration instance in the form of context
+     * @throws TikaConfigException when there is an issue with configuration
+     */
+    @Override
+    public void configure(ParseContext context) throws TikaConfigException {
+        super.configure(context);
+        AnnotationUtils.assignFieldParams(recogniser, context.getParams());
+        recogniser.configure(context);
+        LOG.info("minConfidence = {}, topN={}", minConfidence, topN);
+        LOG.info("Recogniser = {}", recogniser.getClass().getName());
+        LOG.info("Recogniser Available = {}", recogniser.isAvailable());
+    }
+
+    /**
+     * @return the recogniser's supported mimes when it is available, an empty set otherwise
+     */
+    @Override
+    public Set<MediaType> getSupportedTypes(ParseContext context) {
+        return recogniser.isAvailable() ? recogniser.getSupportedMimes() : Collections.<MediaType>emptySet();
+    }
+
+    /**
+     * Runs the recogniser on the stream and reports at most {@code topN} objects whose
+     * confidence is at least {@code minConfidence}, best first. Each reported label is
+     * added to the metadata under {@link #MD_KEY} and written to the handler as an
+     * item of an ordered list. When the recogniser returns nothing, the metadata
+     * entry {@code no.objects} is added instead. Nothing is done when the recogniser
+     * is not available.
+     *
+     * @param stream content stream
+     * @param handler content handler receiving the XHTML output
+     * @param metadata metadata instance to which the labels are added
+     * @param context parser context
+     * @throws IOException when an I/O error occurs
+     * @throws SAXException when an issue with XML occurs
+     * @throws TikaException any generic error
+     */
+    @Override
+    public synchronized void parse(InputStream stream, ContentHandler handler,
+                                   Metadata metadata, ParseContext context)
+            throws IOException, SAXException, TikaException {
+        if (!recogniser.isAvailable()) {
+            LOG.warn("{} is not available for service", recogniser.getClass());
+            return;
+        }
+        metadata.set("object.rec.impl", recogniser.getClass().getSimpleName());
+        long start = System.currentTimeMillis();
+        List<RecognisedObject> objects = recogniser.recognise(stream, handler, metadata, context);
+        LOG.debug("Found {} objects", objects != null ? objects.size() : 0);
+        LOG.debug("Time taken {}ms", System.currentTimeMillis() - start);
+        if (objects == null || objects.isEmpty()) {
+            LOG.warn("NO objects");
+            metadata.add("no.objects", Boolean.TRUE.toString());
+            return;
+        }
+
+        // Sort a private copy: the list belongs to the recogniser and may be
+        // unmodifiable, in which case sorting it in place would throw.
+        List<RecognisedObject> ranked = new java.util.ArrayList<>(objects);
+        Collections.sort(ranked, DESC_CONFIDENCE_SORTER);
+
+        XHTMLContentHandler xhtml = new XHTMLContentHandler(handler, metadata);
+        xhtml.startElement("ol", "id", "objects");
+        int count = 0;
+        for (RecognisedObject object : ranked) {
+            // Check the limit before adding so that topN <= 0 reports nothing.
+            if (count >= topN) {
+                break;
+            }
+            if (object.getConfidence() >= minConfidence) {
+                LOG.debug("Add {}", object);
+                count++;
+                metadata.add(MD_KEY, object.getLabel());
+                //writing to handler
+                xhtml.startElement("li", "id", object.getId());
+                String text = String.format(" %s [%s](confidence = %f )",
+                        object.getLabel(), object.getLabelLang(), object.getConfidence());
+                xhtml.characters(text);
+                xhtml.endElement("li");
+            }
+        }
+        xhtml.endElement("ol");
+    }
+}

http://git-wip-us.apache.org/repos/asf/tika/blob/2184e2c2/tika-parsers/src/main/java/org/apache/tika/parser/recognition/RecognisedObject.java
----------------------------------------------------------------------
diff --git 
a/tika-parsers/src/main/java/org/apache/tika/parser/recognition/RecognisedObject.java
 
b/tika-parsers/src/main/java/org/apache/tika/parser/recognition/RecognisedObject.java
new file mode 100644
index 0000000..d0317c8
--- /dev/null
+++ 
b/tika-parsers/src/main/java/org/apache/tika/parser/recognition/RecognisedObject.java
@@ -0,0 +1,91 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *    http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.tika.parser.recognition;
+
+/**
+ * A model for recognised objects from graphics and texts typically includes
+ * human readable label for the object, language of the label, id and 
confidence score.
+ *
+ * @since Apache Tika 1.14
+ */
+public class RecognisedObject {
+
+    /**
+     * Label of this object. Usually the name given to this object by humans
+     */
+    private String label;
+    /**
+     * Language of label, Example : english
+     */
+    private String labelLang;
+    /**
+     * Identifier for this object
+     */
+    private String id;
+    /**
+     * Confidence score
+     */
+    private double confidence;
+
+    public RecognisedObject(String label, String labelLang, String id, double 
confidence) {
+        this.label = label;
+        this.labelLang = labelLang;
+        this.id = id;
+        this.confidence = confidence;
+    }
+
+    public String getLabel() {
+        return label;
+    }
+
+    public void setLabel(String label) {
+        this.label = label;
+    }
+
+    public String getLabelLang() {
+        return labelLang;
+    }
+
+    public void setLabelLang(String labelLang) {
+        this.labelLang = labelLang;
+    }
+
+    public String getId() {
+        return id;
+    }
+
+    public void setId(String id) {
+        this.id = id;
+    }
+
+    public double getConfidence() {
+        return confidence;
+    }
+
+    public void setConfidence(double confidence) {
+        this.confidence = confidence;
+    }
+
+    @Override
+    public String toString() {
+        return "RecognisedObject{" +
+                "label='" + label + "\' (" + labelLang + ')' +
+                ", id='" + id + '\'' +
+                ", confidence=" + confidence +
+                '}';
+    }
+}

http://git-wip-us.apache.org/repos/asf/tika/blob/2184e2c2/tika-parsers/src/main/java/org/apache/tika/parser/recognition/tf/TensorflowImageRecParser.java
----------------------------------------------------------------------
diff --git 
a/tika-parsers/src/main/java/org/apache/tika/parser/recognition/tf/TensorflowImageRecParser.java
 
b/tika-parsers/src/main/java/org/apache/tika/parser/recognition/tf/TensorflowImageRecParser.java
new file mode 100644
index 0000000..eb1d536
--- /dev/null
+++ 
b/tika-parsers/src/main/java/org/apache/tika/parser/recognition/tf/TensorflowImageRecParser.java
@@ -0,0 +1,149 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.tika.parser.recognition.tf;
+
+import org.apache.http.HttpResponse;
+import org.apache.http.client.methods.HttpGet;
+import org.apache.http.impl.client.DefaultHttpClient;
+import org.apache.tika.config.Field;
+import org.apache.tika.exception.TikaConfigException;
+import org.apache.tika.exception.TikaException;
+import org.apache.tika.io.IOUtils;
+import org.apache.tika.metadata.Metadata;
+import org.apache.tika.mime.MediaType;
+import org.apache.tika.parser.ParseContext;
+import org.apache.tika.parser.external.ExternalParser;
+import org.apache.tika.parser.recognition.ObjectRecogniser;
+import org.apache.tika.parser.recognition.RecognisedObject;
+import org.slf4j.Logger;
+import org.slf4j.LoggerFactory;
+import org.xml.sax.ContentHandler;
+import org.xml.sax.SAXException;
+
+import java.io.*;
+import java.net.URI;
+import java.util.*;
+import java.util.regex.Pattern;
+
+/**
+ * This is an implementation of {@link ObjectRecogniser} powered by
+ * <a href="http://www.tensorflow.org"> Tensorflow </a>
+ * convolutional neural network (CNN). This implementation binds to the Python API
+ * using {@link ExternalParser}.
+ * <br/>
+ * NOTE: This is a proof of concept for an efficient implementation using JNI binding
+ * to Tensorflow's C++ api.
+ *
+ * <br/>
+ *  <p>
+ *      <b>Environment Setup:</b>
+ *      <ol>
+ *          <li> Python must be available </li>
+ *          <li> Tensorflow must be available for import by the python script.
+ *              <a href="https://www.tensorflow.org/versions/r0.9/get_started/os_setup.html#pip-installation">
+ *              Setup Instructions here </a></li>
+ *          <li> All dependencies of tensor flow (such as numpy) must also be available.
+ *              <a href="https://www.tensorflow.org/versions/r0.9/tutorials/image_recognition/index.html#image-recognition">
+ *              Follow the image recognition guide and make sure it works</a></li>
+ *      </ol>
+ *  </p>
+ * @since Apache Tika 1.14
+ */
+public class TensorflowImageRecParser extends ExternalParser implements ObjectRecogniser {
+
+    private static final Logger LOG = LoggerFactory.getLogger(TensorflowImageRecParser.class);
+
+    private static final URI defaultScriptUri = URI.create("https://raw.githubusercontent.com/tensorflow/tensorflow/122cdce33e3e0a01a7f82645617317530aa571fb/tensorflow/models/image/imagenet/classify_image.py");
+    private static final Set<MediaType> supportedMimes = Collections.singleton(MediaType.image("jpeg"));
+    private static final File defaultScriptFile = new File("tensorflow/tf-objectrec.py");
+    private static final File defaultModelFile = new File("tensorflow/tf-objectrec-model");
+    /** Sends the output lines that the external parser ignores to the debug log. */
+    private static final LineConsumer ignoredLineLogger = new LineConsumer() {
+        @Override
+        public void consume(String line) {
+            LOG.debug(line);
+        }
+    };
+
+    /** Location the script is downloaded from when {@link #scriptFile} does not exist. */
+    @Field private URI scriptUri = defaultScriptUri;
+    /** Program used to run the script. */
+    @Field private String executor = "python";
+    @Field private File scriptFile = defaultScriptFile;
+    /** Name of the script argument that takes the model location. */
+    @Field private String modelArg = "--model_dir";
+    @Field private File modelFile = defaultModelFile;
+    /** Name of the script argument that takes the image to classify. */
+    @Field private String imageArg = "--image_file";
+    /** Regex for script output lines: group 1 is the label, group 2 the score. */
+    @Field private String outPattern = "(.*) \\(score = ([0-9]+\\.[0-9]+)\\)$";
+    @Field private String availabilityTestArgs = ""; //when no args are given, the script will test itself!
+
+    /** Result of the availability check made by {@link #configure(ParseContext)}. */
+    private boolean available = false;
+
+    @Override
+    public Set<MediaType> getSupportedMimes() {
+        return supportedMimes;
+    }
+
+    @Override
+    public boolean isAvailable() {
+        return available;
+    }
+
+    /**
+     * Prepares the recogniser: downloads the script when it is missing, checks
+     * that the script can be run, and when it can, sets the command, the metadata
+     * extraction pattern and the ignored-line consumer of the external parser.
+     *
+     * @param context configuration instance in the form of context
+     * @throws TikaConfigException when any step of the setup fails
+     */
+    @Override
+    public void configure(ParseContext context) throws TikaConfigException {
+        super.configure(context);
+        try {
+            if (!modelFile.exists()) {
+                ensureParentDirectory(modelFile);
+                LOG.warn("Model doesn't exist at {}. Expecting the script to download it.", modelFile);
+            }
+            if (!scriptFile.exists()) {
+                ensureParentDirectory(scriptFile);
+                downloadScript();
+            }
+            String[] availabilityCheckArgs = {executor, scriptFile.getAbsolutePath(),
+                    modelArg, modelFile.getAbsolutePath(), availabilityTestArgs};
+            available = ExternalParser.check(availabilityCheckArgs);
+            LOG.debug("Available? {}", available);
+            if (!available) {
+                return;
+            }
+            String[] parseCmd = {
+                    executor, scriptFile.getAbsolutePath(),
+                    modelArg, modelFile.getAbsolutePath(),
+                    imageArg, INPUT_FILE_TOKEN,
+                    "--out_file", OUTPUT_FILE_TOKEN}; //inserting output token to let external parser parse metadata
+            setCommand(parseCmd);
+            HashMap<Pattern, String> patterns = new HashMap<>();
+            patterns.put(Pattern.compile(outPattern), null);
+            setMetadataExtractionPatterns(patterns);
+            setIgnoredLineConsumer(ignoredLineLogger);
+        } catch (Exception e) {
+            throw new TikaConfigException(e.getMessage(), e);
+        }
+    }
+
+    /**
+     * Creates the parent directory of the file when the file has one.
+     * A path made of a bare file name has no parent and needs no directory.
+     */
+    private static void ensureParentDirectory(File file) {
+        File parent = file.getParentFile();
+        if (parent != null) {
+            parent.mkdirs();
+        }
+    }
+
+    /**
+     * Downloads the script from {@link #scriptUri} to {@link #scriptFile}.
+     * The file is created only after a successful response and is removed again
+     * when the transfer fails, so a failed download is retried by the next configure.
+     *
+     * @throws IOException when the request fails, the response is not a success,
+     *                     or the content cannot be written
+     */
+    private void downloadScript() throws IOException {
+        LOG.info("GET : {} -> {}", scriptUri, scriptFile);
+        DefaultHttpClient httpClient = new DefaultHttpClient();
+        try {
+            HttpResponse response = httpClient.execute(new HttpGet(scriptUri));
+            int status = response.getStatusLine().getStatusCode();
+            // 200 is HTTP OK; anything else would store an error page as the script
+            if (status != 200 || response.getEntity() == null) {
+                throw new IOException("Could not download " + scriptUri + " : HTTP status " + status);
+            }
+            try (InputStream content = response.getEntity().getContent();
+                 OutputStream target = new BufferedOutputStream(new FileOutputStream(scriptFile))) {
+                IOUtils.copy(content, target);
+            } catch (IOException | RuntimeException e) {
+                // do not leave a truncated script behind
+                if (scriptFile.exists() && !scriptFile.delete()) {
+                    LOG.warn("Could not remove partially downloaded script {}", scriptFile);
+                }
+                throw e;
+            }
+        } finally {
+            httpClient.getConnectionManager().shutdown();
+        }
+        LOG.debug("Downloaded..");
+    }
+
+    /**
+     * Runs the external script on the stream and turns every metadata entry it
+     * produced into a {@link RecognisedObject}: the entry name is used as label and
+     * id, the entry value is parsed as the confidence, and the language is "eng".
+     * The entries are collected in a fresh {@link Metadata}; the given metadata
+     * instance is not written to by this method.
+     *
+     * @param stream content stream
+     * @param handler Tika's content handler
+     * @param metadata metadata instance
+     * @param context parser context
+     * @return list of the recognised objects
+     * @throws IOException when an I/O error occurs
+     * @throws SAXException when an issue with XML occurs
+     * @throws TikaException any generic error
+     */
+    @Override
+    public List<RecognisedObject> recognise(InputStream stream, ContentHandler handler,
+                                            Metadata metadata, ParseContext context)
+            throws IOException, SAXException, TikaException {
+        Metadata md = new Metadata();
+        parse(stream, handler, md, context);
+        List<RecognisedObject> objects = new ArrayList<>();
+        for (String key: md.names()) {
+            double confidence = Double.parseDouble(md.get(key));
+            objects.add(new RecognisedObject(key, "eng", key, confidence));
+        }
+        return objects;
+    }
+}
+
+

http://git-wip-us.apache.org/repos/asf/tika/blob/2184e2c2/tika-parsers/src/test/java/org/apache/tika/parser/recognition/ObjectRecognitionParserTest.java
----------------------------------------------------------------------
diff --git 
a/tika-parsers/src/test/java/org/apache/tika/parser/recognition/ObjectRecognitionParserTest.java
 
b/tika-parsers/src/test/java/org/apache/tika/parser/recognition/ObjectRecognitionParserTest.java
new file mode 100644
index 0000000..fc96b1d
--- /dev/null
+++ 
b/tika-parsers/src/test/java/org/apache/tika/parser/recognition/ObjectRecognitionParserTest.java
@@ -0,0 +1,68 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.tika.parser.recognition;
+
+import org.apache.commons.lang.StringUtils;
+import org.apache.tika.Tika;
+import org.apache.tika.config.TikaConfig;
+import org.apache.tika.exception.TikaException;
+import org.apache.tika.io.IOUtils;
+import org.apache.tika.metadata.Metadata;
+import org.junit.Assert;
+import org.junit.Ignore;
+import org.junit.Test;
+import org.xml.sax.SAXException;
+
+import java.io.IOException;
+import java.io.InputStream;
+import java.io.Reader;
+import java.util.Arrays;
+import java.util.HashSet;
+import java.util.List;
+
+/**
+ * Test cases for {@link ObjectRecognitionParser}.
+ */
+public class ObjectRecognitionParserTest {
+
+    private static final String CONFIG_FILE = "org/apache/tika/parser/recognition/tika-config-tflow.xml";
+    private static final String CAT_IMAGE = "test-documents/testJPEG.jpg";
+    private static final ClassLoader loader = ObjectRecognitionParserTest.class.getClassLoader();
+
+    /**
+     * Parses the test image with the parser configured from {@link #CONFIG_FILE} and
+     * expects both labels in the extracted text and in the metadata.
+     */
+    @Ignore("If tensorflow not available Ignore")
+    @Test
+    public void jpegTesorflowTest() throws IOException, TikaException, SAXException {
+
+        try (InputStream stream = loader.getResourceAsStream(CONFIG_FILE)) {
+            // a JUnit assertion, since the assert keyword is skipped when -ea is not set
+            Assert.assertNotNull(CONFIG_FILE + " must be on the classpath", stream);
+            Tika tika = new Tika(new TikaConfig(stream));
+            Metadata metadata = new Metadata();
+            try (InputStream imageStream = loader.getResourceAsStream(CAT_IMAGE)) {
+                Assert.assertNotNull(CAT_IMAGE + " must be on the classpath", imageStream);
+                String text;
+                try (Reader reader = tika.parse(imageStream, metadata)) {
+                    List<String> lines = IOUtils.readLines(reader);
+                    text = StringUtils.join(lines, " ");
+                }
+                String[] expectedObjects = {"Egyptian cat", "Border collie"};
+                HashSet<String> objects =
+                        new HashSet<>(Arrays.asList(metadata.getValues(ObjectRecognitionParser.MD_KEY)));
+                for (String expectedObject : expectedObjects) {
+                    String message = "'" + expectedObject + "' must have been detected";
+                    Assert.assertTrue(message, text.contains(expectedObject));
+                    Assert.assertTrue(message, objects.contains(expectedObject));
+                }
+            }
+        }
+    }
+}
\ No newline at end of file

http://git-wip-us.apache.org/repos/asf/tika/blob/2184e2c2/tika-parsers/src/test/java/org/apache/tika/parser/recognition/tf/TensorflowImageRecParserTest.java
----------------------------------------------------------------------
diff --git 
a/tika-parsers/src/test/java/org/apache/tika/parser/recognition/tf/TensorflowImageRecParserTest.java
 
b/tika-parsers/src/test/java/org/apache/tika/parser/recognition/tf/TensorflowImageRecParserTest.java
new file mode 100644
index 0000000..038f2d3
--- /dev/null
+++ 
b/tika-parsers/src/test/java/org/apache/tika/parser/recognition/tf/TensorflowImageRecParserTest.java
@@ -0,0 +1,56 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *    http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.tika.parser.recognition.tf;
+
+import org.apache.tika.metadata.Metadata;
+import org.apache.tika.parser.ParseContext;
+import org.apache.tika.parser.recognition.RecognisedObject;
+import org.junit.Assert;
+import org.junit.Ignore;
+import org.junit.Test;
+import org.xml.sax.helpers.DefaultHandler;
+
+import java.io.InputStream;
+import java.util.HashSet;
+import java.util.List;
+import java.util.Set;
+
+
+/**
+ * Test cases for {@link TensorflowImageRecParser}. The class is ignored as a whole.
+ */
+@Ignore
+public class TensorflowImageRecParserTest {
+
+    private static final String TEST_IMAGE = "test-documents/testJPEG.jpg";
+
+    /**
+     * Runs the recogniser on the test image and expects five objects, among them
+     * the four labels listed below.
+     */
+    @Test
+    public void recognise() throws Exception {
+        TensorflowImageRecParser recogniser = new TensorflowImageRecParser();
+        recogniser.configure(new ParseContext());
+        try (InputStream stream = getClass().getClassLoader().getResourceAsStream(TEST_IMAGE)) {
+            Assert.assertNotNull(TEST_IMAGE + " must be on the classpath", stream);
+            List<RecognisedObject> objects =
+                    recogniser.recognise(stream, new DefaultHandler(), new Metadata(), new ParseContext());
+            // assertEquals reports the actual count when the test fails
+            Assert.assertEquals(5, objects.size());
+            Set<String> objectLabels = new HashSet<>();
+            for (RecognisedObject object : objects) {
+                objectLabels.add(object.getLabel());
+            }
+            String[] expected = {"English foxhound", "Egyptian cat", "collie", "Border collie"};
+            for (String label : expected) {
+                Assert.assertTrue(label + " is expected", objectLabels.contains(label));
+            }
+        }
+    }
+
+}
\ No newline at end of file

http://git-wip-us.apache.org/repos/asf/tika/blob/2184e2c2/tika-parsers/src/test/resources/org/apache/tika/parser/recognition/tika-config-tflow.xml
----------------------------------------------------------------------
diff --git 
a/tika-parsers/src/test/resources/org/apache/tika/parser/recognition/tika-config-tflow.xml
 
b/tika-parsers/src/test/resources/org/apache/tika/parser/recognition/tika-config-tflow.xml
new file mode 100644
index 0000000..f848d15
--- /dev/null
+++ 
b/tika-parsers/src/test/resources/org/apache/tika/parser/recognition/tika-config-tflow.xml
@@ -0,0 +1,29 @@
+<?xml version="1.0" encoding="UTF-8"?>
+<!--
+  Licensed to the Apache Software Foundation (ASF) under one or more
+  contributor license agreements.  See the NOTICE file distributed with
+  this work for additional information regarding copyright ownership.
+  The ASF licenses this file to You under the Apache License, Version 2.0
+  (the "License"); you may not use this file except in compliance with
+  the License.  You may obtain a copy of the License at
+
+  http://www.apache.org/licenses/LICENSE-2.0
+
+  Unless required by applicable law or agreed to in writing, software
+  distributed under the License is distributed on an "AS IS" BASIS,
+  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+  See the License for the specific language governing permissions and
+  limitations under the License.
+-->
+<properties>
+    <parsers>
+        <parser 
class="org.apache.tika.parser.recognition.ObjectRecognitionParser">
+            <mime>image/jpeg</mime>
+            <params>
+                <param name="topN" type="int">2</param>
+                <param name="minConfidence" type="double">0.015</param>
+                <param name="class" 
type="string">org.apache.tika.parser.recognition.tf.TensorflowImageRecParser</param>
+            </params>
+        </parser>
+    </parsers>
+</properties>
\ No newline at end of file

Reply via email to