TIKA-1993 -- High-throughput TensorFlow Inception-based image classifier via:
(1) gRPC and (2) REST API

- Added REST API service Python program to resources
- Added Docker build file for the REST API service
- Added a few test cases


Project: http://git-wip-us.apache.org/repos/asf/tika/repo
Commit: http://git-wip-us.apache.org/repos/asf/tika/commit/9e0a87e6
Tree: http://git-wip-us.apache.org/repos/asf/tika/tree/9e0a87e6
Diff: http://git-wip-us.apache.org/repos/asf/tika/diff/9e0a87e6

Branch: refs/heads/TIKA-1508
Commit: 9e0a87e68a0a6b45a2f740a58b0db99d9eeecb4a
Parents: 31cf12d
Author: Thamme Gowda <[email protected]>
Authored: Mon Jul 25 17:18:33 2016 -0400
Committer: Thamme Gowda <[email protected]>
Committed: Mon Jul 25 17:18:33 2016 -0400

----------------------------------------------------------------------
 .../java/org/apache/tika/config/TikaConfig.java |   4 +-
 .../org/apache/tika/utils/AnnotationUtils.java  |   2 +
 .../apache/tika/utils/ServiceLoaderUtils.java   |   2 +-
 .../recognition/ObjectRecognitionParser.java    |  16 +-
 .../tf/TensorflowGrpcRecogniser.java            | 148 +++++++++
 .../tf/TensorflowImageRecParser.java            |   3 +-
 .../tf/TensorflowRESTRecogniser.java            | 139 ++++++++
 .../recognition/tf/InceptionRestDockerfile      |  39 +++
 .../tika/parser/recognition/tf/inceptionapi.py  | 319 +++++++++++++++++++
 .../ObjectRecognitionParserTest.java            |  52 ++-
 .../recognition/tika-config-tflow-addon.xml     |  30 ++
 .../recognition/tika-config-tflow-rest.xml      |  30 ++
 12 files changed, 768 insertions(+), 16 deletions(-)
----------------------------------------------------------------------


http://git-wip-us.apache.org/repos/asf/tika/blob/9e0a87e6/tika-core/src/main/java/org/apache/tika/config/TikaConfig.java
----------------------------------------------------------------------
diff --git a/tika-core/src/main/java/org/apache/tika/config/TikaConfig.java 
b/tika-core/src/main/java/org/apache/tika/config/TikaConfig.java
index a680916..49bf773 100644
--- a/tika-core/src/main/java/org/apache/tika/config/TikaConfig.java
+++ b/tika-core/src/main/java/org/apache/tika/config/TikaConfig.java
@@ -564,12 +564,10 @@ public class TikaConfig {
 
                 Map<String, Param> params = getParams(element);
                 //Assigning the params to bean fields/setters
+                AnnotationUtils.assignFieldParams(loaded, params);
                 if (loaded instanceof Initializable) {
                     ((Initializable) loaded).initialize(params);
-                } else {
-                    AnnotationUtils.assignFieldParams(loaded, params);
                 }
-
                 // Have any decoration performed, eg explicit mimetypes
                 loaded = decorate(loaded, element);
                 // All done with setup

http://git-wip-us.apache.org/repos/asf/tika/blob/9e0a87e6/tika-core/src/main/java/org/apache/tika/utils/AnnotationUtils.java
----------------------------------------------------------------------
diff --git a/tika-core/src/main/java/org/apache/tika/utils/AnnotationUtils.java 
b/tika-core/src/main/java/org/apache/tika/utils/AnnotationUtils.java
index 9c85e60..eead096 100644
--- a/tika-core/src/main/java/org/apache/tika/utils/AnnotationUtils.java
+++ b/tika-core/src/main/java/org/apache/tika/utils/AnnotationUtils.java
@@ -134,6 +134,7 @@ public class AnnotationUtils {
                 //LOG.debug("Param not supplied, field is not mandatory");
             }
         }
+        /*
         //now test that params doesn't contain a field
         //not allowed by this object
         for (String fieldName : params.keySet()) {
@@ -144,5 +145,6 @@ public class AnnotationUtils {
                 throw new TikaConfigException(msg);
             }
         }
+        */
     }
 }
\ No newline at end of file

http://git-wip-us.apache.org/repos/asf/tika/blob/9e0a87e6/tika-core/src/main/java/org/apache/tika/utils/ServiceLoaderUtils.java
----------------------------------------------------------------------
diff --git 
a/tika-core/src/main/java/org/apache/tika/utils/ServiceLoaderUtils.java 
b/tika-core/src/main/java/org/apache/tika/utils/ServiceLoaderUtils.java
index a1ccacb..0887e0d 100644
--- a/tika-core/src/main/java/org/apache/tika/utils/ServiceLoaderUtils.java
+++ b/tika-core/src/main/java/org/apache/tika/utils/ServiceLoaderUtils.java
@@ -67,7 +67,7 @@ public class ServiceLoaderUtils {
      */
     public static <T> T newInstance(String className, ClassLoader loader){
         try {
-            Class loadedClass = loader.loadClass(className);
+            Class loadedClass = Class.forName(className, true, loader);
             Class<T> castedClass = loadedClass;
             T instance = castedClass.newInstance();
             return instance;

http://git-wip-us.apache.org/repos/asf/tika/blob/9e0a87e6/tika-parsers/src/main/java/org/apache/tika/parser/recognition/ObjectRecognitionParser.java
----------------------------------------------------------------------
diff --git 
a/tika-parsers/src/main/java/org/apache/tika/parser/recognition/ObjectRecognitionParser.java
 
b/tika-parsers/src/main/java/org/apache/tika/parser/recognition/ObjectRecognitionParser.java
index c3262f9..a44564b 100644
--- 
a/tika-parsers/src/main/java/org/apache/tika/parser/recognition/ObjectRecognitionParser.java
+++ 
b/tika-parsers/src/main/java/org/apache/tika/parser/recognition/ObjectRecognitionParser.java
@@ -25,7 +25,7 @@ import org.apache.tika.metadata.Metadata;
 import org.apache.tika.mime.MediaType;
 import org.apache.tika.parser.AbstractParser;
 import org.apache.tika.parser.ParseContext;
-import org.apache.tika.parser.recognition.tf.TensorflowImageRecParser;
+import org.apache.tika.parser.recognition.tf.TensorflowRESTRecogniser;
 import org.apache.tika.sax.XHTMLContentHandler;
 import org.apache.tika.utils.AnnotationUtils;
 import org.apache.tika.utils.ServiceLoaderUtils;
@@ -47,7 +47,7 @@ import java.util.Set;
 /**
  * This parser recognises objects from Images.
  * The Object Recognition implementation can be switched using 'class' 
argument.
- *
+ * <p>
  * <b>Example Usage : </b>
  * <pre>
  * &lt;properties&gt;
@@ -57,7 +57,7 @@ import java.util.Set;
  *    &lt;params&gt;
  *      &lt;param name=&quot;topN&quot; type=&quot;int&quot;&gt;2&lt;/param&gt;
  *      &lt;param name=&quot;minConfidence&quot; 
type=&quot;double&quot;&gt;0.015&lt;/param&gt;
- *      &lt;param name=&quot;class&quot; 
type=&quot;string&quot;&gt;org.apache.tika.parser.recognition.tf.TensorflowImageRecParser&lt;/param&gt;
+ *      &lt;param name=&quot;class&quot; 
type=&quot;string&quot;&gt;org.apache.tika.parser.recognition.tf.TensorflowRESTRecogniser&lt;/param&gt;
  *    &lt;/params&gt;
  *   &lt;/parser&gt;
  *  &lt;/parsers&gt;
@@ -80,11 +80,13 @@ public class ObjectRecognitionParser extends AbstractParser 
implements Initializ
                 }
             };
 
-    @Field private double minConfidence = 0.05;
+    @Field
+    private double minConfidence = 0.05;
 
-    @Field private int topN = 2;
+    @Field
+    private int topN = 2;
 
-    private ObjectRecogniser recogniser = new TensorflowImageRecParser();
+    private ObjectRecogniser recogniser = new TensorflowRESTRecogniser();
 
     @Field(name = "class")
     public void setRecogniser(String recogniserClass) {
@@ -117,7 +119,7 @@ public class ObjectRecognitionParser extends AbstractParser 
implements Initializ
         List<RecognisedObject> objects = recogniser.recognise(stream, handler, 
metadata, context);
         LOG.debug("Found {} objects", objects != null ? objects.size() : 0);
         LOG.debug("Time taken {}ms", System.currentTimeMillis() - start);
-        if (objects != null && !objects.isEmpty()){
+        if (objects != null && !objects.isEmpty()) {
 
             XHTMLContentHandler xhtml = new XHTMLContentHandler(handler, 
metadata);
             xhtml.startElement("ol", "id", "objects");

http://git-wip-us.apache.org/repos/asf/tika/blob/9e0a87e6/tika-parsers/src/main/java/org/apache/tika/parser/recognition/tf/TensorflowGrpcRecogniser.java
----------------------------------------------------------------------
diff --git 
a/tika-parsers/src/main/java/org/apache/tika/parser/recognition/tf/TensorflowGrpcRecogniser.java
 
b/tika-parsers/src/main/java/org/apache/tika/parser/recognition/tf/TensorflowGrpcRecogniser.java
new file mode 100644
index 0000000..4a45587
--- /dev/null
+++ 
b/tika-parsers/src/main/java/org/apache/tika/parser/recognition/tf/TensorflowGrpcRecogniser.java
@@ -0,0 +1,148 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *    http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.tika.parser.recognition.tf;
+
+import org.apache.tika.config.Field;
+import org.apache.tika.config.Param;
+import org.apache.tika.exception.TikaConfigException;
+import org.apache.tika.exception.TikaException;
+import org.apache.tika.metadata.Metadata;
+import org.apache.tika.mime.MediaType;
+import org.apache.tika.parser.ParseContext;
+import org.apache.tika.parser.recognition.ObjectRecogniser;
+import org.apache.tika.parser.recognition.RecognisedObject;
+import org.slf4j.Logger;
+import org.slf4j.LoggerFactory;
+import org.xml.sax.ContentHandler;
+import org.xml.sax.SAXException;
+
+import java.io.Closeable;
+import java.io.File;
+import java.io.IOException;
+import java.io.InputStream;
+import java.lang.reflect.InvocationTargetException;
+import java.lang.reflect.Method;
+import java.net.URL;
+import java.net.URLClassLoader;
+import java.util.ArrayList;
+import java.util.List;
+import java.util.Map;
+import java.util.Set;
+
+/**
+ * Tensor Flow image recogniser which has high performance.
+ * This implementation takes an addon jar and binds it using reflection,
+ * without corrupting the classpath with incompatible versions of dependencies.
+ * <p>
+ * The addon jar can be built from https://github.com/thammegowda/tensorflow-grpc-java
+ *
+ * @since Apache Tika 1.14
+ */
+public class TensorflowGrpcRecogniser implements ObjectRecogniser, Closeable {
+
+    private static final Logger LOG = LoggerFactory.getLogger(TensorflowGrpcRecogniser.class);
+    private static final String LABEL_LANG = "en";
+
+    /** Top-most class loader; used as the parent of the addon class loader. */
+    private static final ClassLoader PARENT_CL = findRootClassLoader();
+
+    /** Name of the class inside the addon jar that is instantiated via reflection. */
+    @Field
+    private String recogniserClass = "edu.usc.irds.tensorflow.grpc.TensorflowObjectRecogniser";
+
+    /** Host name passed to the addon recogniser's constructor. */
+    @Field
+    private String host = "localhost";
+
+    /** Port passed to the addon recogniser's constructor. */
+    @Field
+    private int port = 9000;
+
+    /** Addon jar that contains {@link #recogniserClass}. */
+    @Field(name = "addon", required = true)
+    private File addon;
+
+    private boolean available;
+
+    private Object instance;
+    private Method recogniseMethod;
+    private Method closeMethod;
+
+    /**
+     * Moves up the class loader hierarchy until the loader without a parent is reached.
+     *
+     * @return the top-most loader, or {@code null} when this class itself was
+     *         loaded by the bootstrap loader
+     */
+    private static ClassLoader findRootClassLoader() {
+        ClassLoader loader = TensorflowGrpcRecogniser.class.getClassLoader();
+        while (loader != null && loader.getParent() != null) {
+            loader = loader.getParent();
+        }
+        return loader;
+    }
+
+    @Override
+    public Set<MediaType> getSupportedMimes() {
+        return TensorflowImageRecParser.SUPPORTED_MIMES;
+    }
+
+    @Override
+    public boolean isAvailable() {
+        return available;
+    }
+
+    /**
+     * Loads the addon jar in its own class loader, instantiates the configured
+     * recogniser class with {@code (host, port)} and looks up its
+     * {@code recognise(InputStream)} and {@code close()} methods.
+     *
+     * @param params configured params (not read here; the annotated fields are used)
+     * @throws TikaConfigException if the addon is not set, does not exist,
+     *                             or cannot be loaded and bound
+     */
+    @Override
+    public void initialize(Map<String, Param> params) throws TikaConfigException {
+        available = false;
+        // Report a missing param explicitly instead of a wrapped NullPointerException
+        if (addon == null) {
+            throw new TikaConfigException("Required param 'addon' is not set");
+        }
+        if (!addon.exists()) {
+            throw new TikaConfigException("File " + addon + " doesnt exists");
+        }
+        try {
+            URL[] urls = {addon.getAbsoluteFile().toURI().toURL()};
+            // The loader is intentionally left open: classes of the addon may
+            // still be loaded lazily for as long as the instance is in use.
+            URLClassLoader loader = new URLClassLoader(urls, PARENT_CL);
+            Class<?> clazz = Class.forName(recogniserClass, true, loader);
+            instance = clazz.getConstructor(String.class, int.class)
+                    .newInstance(host, port);
+            recogniseMethod = clazz.getMethod("recognise", InputStream.class);
+            closeMethod = clazz.getMethod("close");
+            available = true;
+        } catch (Exception e) {
+            throw new TikaConfigException(e.getMessage(), e);
+        }
+    }
+
+    /**
+     * Delegates recognition to the addon and converts its result into
+     * {@link RecognisedObject}s. Reflection failures are logged and yield an
+     * empty list.
+     *
+     * @return the recognised objects; never {@code null}
+     */
+    @Override
+    public List<RecognisedObject> recognise(InputStream stream,
+                                            ContentHandler handler, Metadata metadata,
+                                            ParseContext context)
+            throws IOException, SAXException, TikaException {
+        List<RecognisedObject> recObjs = new ArrayList<>();
+        if (recogniseMethod == null) {
+            LOG.warn("Recogniser is not initialized, skipping recognition");
+            return recObjs;
+        }
+        try {
+            Object result = recogniseMethod.invoke(instance, stream);
+            if (result == null) {
+                LOG.warn("Result is null");
+                return recObjs;
+            }
+            // The reflective call is untyped; this cast mirrors the result
+            // type the addon is expected to return.
+            @SuppressWarnings("unchecked")
+            List<Map.Entry<String, Double>> objects = (List<Map.Entry<String, Double>>) result;
+            for (Map.Entry<String, Double> object : objects) {
+                recObjs.add(new RecognisedObject(object.getKey(),
+                        LABEL_LANG, object.getKey(), object.getValue()));
+            }
+        } catch (IllegalAccessException | InvocationTargetException e) {
+            // Logged at warn level, like the REST recogniser does for its failures
+            LOG.warn("Failed to invoke the addon recogniser", e);
+        }
+        return recObjs;
+    }
+
+    /**
+     * Closes the addon recogniser, if one was bound. Failures are logged, not thrown.
+     */
+    @Override
+    public void close() throws IOException {
+        if (closeMethod != null) {
+            try {
+                closeMethod.invoke(instance);
+            } catch (IllegalAccessException | InvocationTargetException e) {
+                LOG.warn("Failed to close the addon recogniser", e);
+            }
+        }
+    }
+}

http://git-wip-us.apache.org/repos/asf/tika/blob/9e0a87e6/tika-parsers/src/main/java/org/apache/tika/parser/recognition/tf/TensorflowImageRecParser.java
----------------------------------------------------------------------
diff --git 
a/tika-parsers/src/main/java/org/apache/tika/parser/recognition/tf/TensorflowImageRecParser.java
 
b/tika-parsers/src/main/java/org/apache/tika/parser/recognition/tf/TensorflowImageRecParser.java
index c47a105..7ed8ccb 100644
--- 
a/tika-parsers/src/main/java/org/apache/tika/parser/recognition/tf/TensorflowImageRecParser.java
+++ 
b/tika-parsers/src/main/java/org/apache/tika/parser/recognition/tf/TensorflowImageRecParser.java
@@ -60,13 +60,14 @@ import java.util.regex.Pattern;
  *          <li> All dependencies of tensor flow (such as numpy) must also be 
available. <a 
href="https://www.tensorflow.org/versions/r0.9/tutorials/image_recognition/index.html#image-recognition";>Follow
 the image recognition guide and make sure it works</a></li>
  *      </ol>
  *  </p>
+ *  @see TensorflowGrpcRecogniser
  * @since Apache Tika 1.14
  */
 public class TensorflowImageRecParser extends ExternalParser implements 
ObjectRecogniser {
 
     private static final Logger LOG = 
LoggerFactory.getLogger(TensorflowImageRecParser.class);
     private static final String SCRIPT_FILE_NAME = "classify_image.py";
-    private static final Set<MediaType> SUPPORTED_MIMES = 
Collections.singleton(MediaType.image("jpeg"));
+    public static final Set<MediaType> SUPPORTED_MIMES = 
Collections.singleton(MediaType.image("jpeg"));
     private static final File DEFAULT_SCRIPT_FILE = new File("tensorflow" + 
File.separator + SCRIPT_FILE_NAME);
     private static final File DEFAULT_MODEL_FILE = new File("tensorflow" + 
File.separator + "tf-objectrec-model");
     private static final LineConsumer IGNORED_LINE_LOGGER = new LineConsumer() 
{

http://git-wip-us.apache.org/repos/asf/tika/blob/9e0a87e6/tika-parsers/src/main/java/org/apache/tika/parser/recognition/tf/TensorflowRESTRecogniser.java
----------------------------------------------------------------------
diff --git 
a/tika-parsers/src/main/java/org/apache/tika/parser/recognition/tf/TensorflowRESTRecogniser.java
 
b/tika-parsers/src/main/java/org/apache/tika/parser/recognition/tf/TensorflowRESTRecogniser.java
new file mode 100644
index 0000000..8722dee
--- /dev/null
+++ 
b/tika-parsers/src/main/java/org/apache/tika/parser/recognition/tf/TensorflowRESTRecogniser.java
@@ -0,0 +1,139 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *    http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.tika.parser.recognition.tf;
+
+import org.apache.http.HttpResponse;
+import org.apache.http.client.methods.HttpGet;
+import org.apache.http.client.methods.HttpPost;
+import org.apache.http.entity.ByteArrayEntity;
+import org.apache.http.impl.client.DefaultHttpClient;
+import org.apache.tika.config.Field;
+import org.apache.tika.config.Param;
+import org.apache.tika.exception.TikaConfigException;
+import org.apache.tika.exception.TikaException;
+import org.apache.tika.io.IOUtils;
+import org.apache.tika.metadata.Metadata;
+import org.apache.tika.mime.MediaType;
+import org.apache.tika.parser.ParseContext;
+import org.apache.tika.parser.recognition.ObjectRecogniser;
+import org.apache.tika.parser.recognition.RecognisedObject;
+import org.json.JSONArray;
+import org.json.JSONObject;
+import org.slf4j.Logger;
+import org.slf4j.LoggerFactory;
+import org.xml.sax.ContentHandler;
+import org.xml.sax.SAXException;
+
+import java.io.ByteArrayOutputStream;
+import java.io.IOException;
+import java.io.InputStream;
+import java.net.URI;
+import java.util.ArrayList;
+import java.util.List;
+import java.util.Map;
+import java.util.Set;
+
+/**
+ * Tensor Flow image recogniser which has high performance.
+ * This implementation uses Tensorflow via REST API.
+ * <p>
+ * NOTE : //TODO: link to wiki page here
+ *
+ * @since Apache Tika 1.14
+ */
+public class TensorflowRESTRecogniser implements ObjectRecogniser {
+
+    /**
+     * Maximum buffer size for image
+     */
+    private static final Logger LOG = 
LoggerFactory.getLogger(TensorflowRESTRecogniser.class);
+    private static final String LABEL_LANG = "en";
+
+    @Field
+    private URI apiUri = 
URI.create("http://localhost:8764/inception/v3/classify?topk=10";);
+    @Field
+    private URI healthUri = 
URI.create("http://localhost:8764/inception/v3/ping";);
+
+    private boolean available;
+
+    @Override
+    public Set<MediaType> getSupportedMimes() {
+        return TensorflowImageRecParser.SUPPORTED_MIMES;
+    }
+
+    @Override
+    public boolean isAvailable() {
+        return available;
+    }
+
+    @Override
+    public void initialize(Map<String, Param> params) throws 
TikaConfigException {
+        try {
+            DefaultHttpClient client = new DefaultHttpClient();
+            HttpResponse response = client.execute(new HttpGet(healthUri));
+            available = response.getStatusLine().getStatusCode() == 200;
+            LOG.info("Available = {}, API Status = {}", available, 
response.getStatusLine());
+        } catch (Exception e) {
+            available = false;
+            throw new TikaConfigException(e.getMessage(), e);
+        }
+    }
+
+    @Override
+    public List<RecognisedObject> recognise(InputStream stream,
+                                            ContentHandler handler, Metadata 
metadata, ParseContext context)
+            throws IOException, SAXException, TikaException {
+        List<RecognisedObject> recObjs = new ArrayList<>();
+        try {
+            DefaultHttpClient client = new DefaultHttpClient();
+
+            HttpPost request = new HttpPost(apiUri);
+
+            try (ByteArrayOutputStream byteStream = new 
ByteArrayOutputStream()) {
+                //TODO: convert this to stream, this might cause OOM issue
+                // InputStreamEntity is not working
+                // request.setEntity(new InputStreamEntity(stream, -1));
+                IOUtils.copy(stream, byteStream);
+                request.setEntity(new 
ByteArrayEntity(byteStream.toByteArray()));
+            }
+
+            HttpResponse response = client.execute(request);
+            try (InputStream reply = response.getEntity().getContent()) {
+                String replyMessage = IOUtils.toString(reply);
+                if (response.getStatusLine().getStatusCode() == 200) {
+                    JSONObject jReply = new JSONObject(replyMessage);
+                    JSONArray jClasses = jReply.getJSONArray("classnames");
+                    JSONArray jConfidence = jReply.getJSONArray("confidence");
+                    assert jClasses.length() == jConfidence.length();
+                    for (int i = 0; i < jClasses.length(); i++) {
+                        RecognisedObject recObj = new 
RecognisedObject(jClasses.getString(i),
+                                LABEL_LANG, jClasses.getString(i), 
jConfidence.getDouble(i));
+                        recObjs.add(recObj);
+                    }
+                } else {
+                    LOG.warn("Status = {}", response.getStatusLine());
+                    LOG.warn("Response = {}", replyMessage);
+                }
+            }
+        } catch (Exception e) {
+            LOG.warn(e.getMessage(), e);
+        }
+        LOG.debug("Num Objects found {}", recObjs.size());
+        return recObjs;
+    }
+}

http://git-wip-us.apache.org/repos/asf/tika/blob/9e0a87e6/tika-parsers/src/main/resources/org/apache/tika/parser/recognition/tf/InceptionRestDockerfile
----------------------------------------------------------------------
diff --git 
a/tika-parsers/src/main/resources/org/apache/tika/parser/recognition/tf/InceptionRestDockerfile
 
b/tika-parsers/src/main/resources/org/apache/tika/parser/recognition/tf/InceptionRestDockerfile
new file mode 100644
index 0000000..ea6df2b
--- /dev/null
+++ 
b/tika-parsers/src/main/resources/org/apache/tika/parser/recognition/tf/InceptionRestDockerfile
@@ -0,0 +1,39 @@
+#
+# Licensed to the Apache Software Foundation (ASF) under one or more
+# contributor license agreements.  See the NOTICE file distributed with
+# this work for additional information regarding copyright ownership.
+# The ASF licenses this file to You under the Apache License, Version 2.0
+# (the "License"); you may not use this file except in compliance with
+# the License.  You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+#
+FROM ubuntu
+
+MAINTAINER Thamme Gowda <[email protected]>
+
+# Install missing part of ubuntu core + python stuff.
+# The apt cache is cleaned up in the same RUN instruction: a cleanup in a
+# later instruction cannot shrink the layer that was already committed,
+# so it would not make the published image any smaller.
+RUN apt-get update && \
+    apt-get install -y python-pip python-dev wget && \
+    apt-get clean && \
+    rm -rf /var/lib/apt/lists/*
+
+# Install tensorflow and other dependencies
+RUN \
+  pip install --upgrade https://storage.googleapis.com/tensorflow/linux/cpu/tensorflow-0.9.0-cp27-none-linux_x86_64.whl && \
+  pip install flask requests
+
+# TODO: Change the URL to Apache/Tika Repo when this PR gets merged
+RUN \
+  wget https://raw.githubusercontent.com/thammegowda/tika/TIKA-1993/tika-parsers/src/main/resources/org/apache/tika/parser/recognition/tf/inceptionapi.py -O /usr/bin/inceptionapi.py && \
+  chmod +x /usr/bin/inceptionapi.py
+
+# expose API port, this is the default port
+EXPOSE 8764
\ No newline at end of file

http://git-wip-us.apache.org/repos/asf/tika/blob/9e0a87e6/tika-parsers/src/main/resources/org/apache/tika/parser/recognition/tf/inceptionapi.py
----------------------------------------------------------------------
diff --git 
a/tika-parsers/src/main/resources/org/apache/tika/parser/recognition/tf/inceptionapi.py
 
b/tika-parsers/src/main/resources/org/apache/tika/parser/recognition/tf/inceptionapi.py
new file mode 100755
index 0000000..723de62
--- /dev/null
+++ 
b/tika-parsers/src/main/resources/org/apache/tika/parser/recognition/tf/inceptionapi.py
@@ -0,0 +1,319 @@
+#!/usr/bin/env python
+#  Licensed to the Apache Software Foundation (ASF) under one
+#  or more contributor license agreements.  See the NOTICE file
+#  distributed with this work for additional information
+#  regarding copyright ownership.  The ASF licenses this file
+#  to you under the Apache License, Version 2.0 (the
+#  "License"); you may not use this file except in compliance
+#  with the License.  You may obtain a copy of the License at
+#    http://www.apache.org/licenses/LICENSE-2.0
+#  Unless required by applicable law or agreed to in writing,
+#  software distributed under the License is distributed on an
+#  "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+#  KIND, either express or implied.  See the License for the
+#  specific language governing permissions and limitations
+#  under the License.
+
+"""
+Image classification with Inception.
+
+This script exposes the tensorflow's inception classification service over 
REST API.
+
+For more details, visit:
+    https://tensorflow.org/tutorials/image_recognition/
+
+Requirements :
+  Flask
+  tensorflow
+  numpy
+  requests
+"""
+
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+import os.path
+import re
+import sys
+import tarfile
+
+import numpy as np
+from six.moves import urllib
+import tensorflow as tf
+import requests
+import json
+json.encoder.FLOAT_REPR = lambda o: format(o, '.2f') # JSON serialization of 
floats
+from time import time
+
+import flask
+
+
+FLAGS = tf.app.flags.FLAGS
+
+# classify_image_graph_def.pb:
+#   Binary representation of the GraphDef protocol buffer.
+# imagenet_synset_to_human_label_map.txt:
+#   Map from synset ID to a human readable string.
+# imagenet_2012_challenge_label_map_proto.pbtxt:
+#   Text representation of a protocol buffer mapping a label to synset ID.
+tf.app.flags.DEFINE_string(
+    'model_dir', '/tmp/imagenet',
+    """Path to classify_image_graph_def.pb, """
+    """imagenet_synset_to_human_label_map.txt, and """
+    """imagenet_2012_challenge_label_map_proto.pbtxt.""")
+tf.app.flags.DEFINE_integer('port', '8764', """Server PORT, default:8764""")
+tf.app.flags.DEFINE_string('log', 'inception.log', """Log file name, default: 
inception.log""")
+
+# pylint: disable=line-too-long
+DATA_URL = 
'http://download.tensorflow.org/models/image/imagenet/inception-2015-12-05.tgz'
+# pylint: enable=line-too-long
+
+class NodeLookup(object):
+    """Converts integer node ID's to human readable labels."""
+
+    def __init__(self,
+                 label_lookup_path=None,
+                 uid_lookup_path=None):
+        # Both paths default to the well-known file names inside FLAGS.model_dir
+        if not label_lookup_path:
+            label_lookup_path = os.path.join(
+                FLAGS.model_dir, 
'imagenet_2012_challenge_label_map_proto.pbtxt')
+        if not uid_lookup_path:
+            uid_lookup_path = os.path.join(
+                FLAGS.model_dir, 'imagenet_synset_to_human_label_map.txt')
+        # The lookup table is built once, at construction time
+        self.node_lookup = self.load(label_lookup_path, uid_lookup_path)
+
+    def load(self, label_lookup_path, uid_lookup_path):
+        """Loads a human readable English name for each softmax node.
+
+        Args:
+          label_lookup_path: string UID to integer node ID.
+          uid_lookup_path: string UID to human-readable string.
+
+        Returns:
+          dict from integer node ID to human-readable string.
+        """
+        # NOTE(review): no return/raise follows these log calls; unless
+        # tf.logging.fatal itself aborts, the reads below will fail on a
+        # missing file -- confirm.
+        if not tf.gfile.Exists(uid_lookup_path):
+            tf.logging.fatal('File does not exist %s', uid_lookup_path)
+        if not tf.gfile.Exists(label_lookup_path):
+            tf.logging.fatal('File does not exist %s', label_lookup_path)
+
+        # Loads mapping from string UID to human-readable string
+        proto_as_ascii_lines = tf.gfile.GFile(uid_lookup_path).readlines()
+        uid_to_human = {}
+        p = re.compile(r'[n\d]*[ \S,]*')
+        for line in proto_as_ascii_lines:
+            # Item 0 is used as the UID and item 2 as the label; presumably each
+            # line is "<uid><tab><label>" so that item 1 is an empty match at
+            # the separator -- TODO confirm against the data file.
+            parsed_items = p.findall(line)
+            uid = parsed_items[0]
+            human_string = parsed_items[2]
+            uid_to_human[uid] = human_string
+
+        # Loads mapping from string UID to integer node ID.
+        node_id_to_uid = {}
+        proto_as_ascii = tf.gfile.GFile(label_lookup_path).readlines()
+        for line in proto_as_ascii:
+            # A 'target_class' line sets the node ID that the next
+            # 'target_class_string' line is stored under.
+            if line.startswith('  target_class:'):
+                target_class = int(line.split(': ')[1])
+            if line.startswith('  target_class_string:'):
+                target_class_string = line.split(': ')[1]
+                # [1:-2] drops the first and the last two characters; presumably
+                # the opening quote and the closing quote + newline -- TODO confirm
+                node_id_to_uid[target_class] = target_class_string[1:-2]
+
+        # Loads the final mapping of integer node ID to human-readable string
+        node_id_to_name = {}
+        for key, val in node_id_to_uid.items():
+            if val not in uid_to_human:
+                tf.logging.fatal('Failed to locate: %s', val)
+            name = uid_to_human[val]
+            node_id_to_name[key] = name
+
+        return node_id_to_name
+
+    def id_to_string(self, node_id):
+        # Unknown node IDs map to an empty string instead of raising
+        if node_id not in self.node_lookup:
+            return ''
+        return self.node_lookup[node_id]
+
+def create_graph():
+    """Creates a graph from saved GraphDef file and returns a saver."""
+    # Creates graph from saved graph_def.pb.
+    with tf.gfile.FastGFile(os.path.join(
+            FLAGS.model_dir, 'classify_image_graph_def.pb'), 'rb') as f:
+        graph_def = tf.GraphDef()
+        graph_def.ParseFromString(f.read())
+        _ = tf.import_graph_def(graph_def, name='')
+
+
+def maybe_download_and_extract():
+    """Download and extract model tar file."""
+    dest_directory = FLAGS.model_dir
+    if not os.path.exists(dest_directory):
+        os.makedirs(dest_directory)
+    filename = DATA_URL.split('/')[-1]
+    filepath = os.path.join(dest_directory, filename)
+    if not os.path.exists(filepath):
+        def _progress(count, block_size, total_size):
+            sys.stdout.write('\r>> Downloading %s %.1f%%' % (
+                filename, float(count * block_size) / float(total_size) * 
100.0))
+            sys.stdout.flush()
+
+        filepath, _ = urllib.request.urlretrieve(DATA_URL, filepath, _progress)
+        print()
+        statinfo = os.stat(filepath)
+        print('Succesfully downloaded', filename, statinfo.st_size, 'bytes.')
+    tarfile.open(filepath, 'r:gz').extractall(dest_directory)
+
+def current_time():
+    """
+        Returns current time in milli seconds
+    """
+    return int(1000 * time())
+
+class Classifier(flask.Flask):
+    '''
+    Classifier Service class
+    '''
+    def __init__(self, name):
+        super(Classifier, self).__init__(name)
+        maybe_download_and_extract()
+        create_graph()
+        self.sess = tf.Session()
+        self.softmax_tensor = self.sess.graph.get_tensor_by_name('softmax:0')
+        self.node_lookup = NodeLookup()
+        print("Logs are directed to %s" % FLAGS.log)
+        import logging
+        from logging.handlers import RotatingFileHandler
+        file_handler = RotatingFileHandler(FLAGS.log, maxBytes=1024 * 1024 * 
100, backupCount=20)
+        file_handler.setLevel(logging.INFO)
+        formatter = logging.Formatter("%(asctime)s - %(name)s - %(levelname)s 
- %(message)s")
+        file_handler.setFormatter(formatter)
+        self.logger.addHandler(file_handler)
+
+    def classify(self, image_data, topk):
+        predictions = self.sess.run(self.softmax_tensor,
+                                    {'DecodeJpeg/contents:0': image_data})
+        predictions = np.squeeze(predictions)
+        top_k = predictions.argsort()[-topk:][::-1]
+        res = []
+        for node_id in top_k:
+            class_name = self.node_lookup.id_to_string(node_id)
+            score = float(predictions[node_id])
+            res.append((node_id, class_name, score))
+        return res
+
+
+from flask import Flask, request, abort, g, Response, jsonify
+app = Classifier(__name__)
+
+def get_remotefile(url, success=200, timeout=10):
+    """
+        Given HTTP URL, this api gets the content of it
+        returns (Content-Type, image_content)
+    """
+    try:
+        app.logger.info("GET: %s" % url)
+        auth = None
+        res = requests.get(url, stream=True, timeout=timeout, auth=auth)
+        if res.status_code == success:
+            return res.headers.get('Content-Type', 
'application/octet-stream'), res.raw.data
+    except:
+        pass
+    return None, None
+
[email protected]("/")
+def index():
+    """
+        The index page which provide information about other API end points
+    """
+    return """
+    <div>
+    <h1> Inception REST API </h1>
+    <h3> The following API end points are valid </h3>
+        <ul>
+            <h4> Inception V3 </h4>
+            <li> <code>/inception/v3/classes </code> - <br/>
+                <b> Description : </b> This API gets all classes/object types 
known to the current model
+            </li>
+            <li> <code>/inception/v3/ping </code> - <br/>
+                <b> Description : </b> checks availability of the service. 
returns "pong" with status 200 when it is available
+            </li>
+            <li> <code>/inception/v3/classify</code> - <br/>
+                <table>
+                <tr><th align="left"> Description </th><td> This is a 
classifier service that can classify images</td></tr>
+                <tr><td></td> <td>Query Params : <br/>
+                   <code>topk </code>: type = int : top classes to get; 
default : 10 <br/>
+                   <code>human </code>: type = boolean : human readable class 
names; default : true <br/>
+                 </td></tr>
+                <tr><th align="left"> How to supply Image Content </th></tr>
+                <tr><th align="left"> With HTTP GET : </th> <td>
+                    Include a query parameter <code>url </code> which is an 
http url of JPEG image <br/>
+                    Example: <code> curl 
"localhost:8764/inception/v3/classify?url=http://xyz.com/example.jpg";</code>
+                </td></tr>
+                <tr><th align="left"> With HTTP POST :</th><td>
+                    POST JPEG image content as binary data in request body. 
<br/>
+                    Example: <code> curl -X POST 
"localhost:8764/inception/v3/classify?topk=10&human=false" --data-binary 
@example.jpg </code>
+                </td></tr>
+                </table>
+            </li>
+        <ul>
+    </div>
+    """
+
[email protected]("/inception/v3/classes", methods=["GET"])
+def get_classes():
+    """API to list all known classes
+    """
+    return jsonify(app.node_lookup.node_lookup)
+
[email protected]("/inception/v3/ping", methods=["GET"])
+def ping_pong():
+    """API to do health check. If this says status code 200, then healthy
+    """
+    return "pong"
+
[email protected]("/inception/v3/classify", methods=["GET", "POST"])
+def classify_image():
+    """
+    API to classify images
+    """
+    st = current_time()
+    topk = int(request.args.get("topk", "10"))
+    human = request.args.get("human", "true").lower() in ("true", "1", "yes")
+    if request.method == 'POST':
+        image_data = request.get_data()
+    else:
+        url = request.args.get("url")
+        c_type, image_data = get_remotefile(url)
+        if not image_data:
+            return flask.Response(status=400, response=jsonify(error="Couldnot 
HTTP GET %s" % url))
+        if 'image/jpeg' not in c_type:
+            return flask.Response(status=400, response=jsonify(error="Content 
of %s is not JPEG" % url))
+    read_time = current_time() - st
+    st = current_time() # reset start time
+    try:
+        classes = app.classify(image_data=image_data, topk=topk)
+    except Exception as e:
+        app.logger.error(e)
+        return Response(status=400, response=str(e))
+    classids, classnames, confidence = zip(*classes)
+    classifier_time = current_time() - st
+    app.logger.info("Classifier time : %d" % classifier_time)
+    res = {
+        'classids' : classids,
+        'confidence': confidence,
+        'time': {
+            'read' : read_time,
+            'classification': classifier_time,
+            'units': 'ms'
+        }
+    }
+    if human:
+        res['classnames'] = classnames
+    return Response(response=json.dumps(res), status=200, 
mimetype="application/json")
+
+def main(_):
+    if not app.debug:
+        print("Serving on port %d" % FLAGS.port)
+    app.run(port=FLAGS.port)
+
+if __name__ == '__main__':
+    tf.app.run()

http://git-wip-us.apache.org/repos/asf/tika/blob/9e0a87e6/tika-parsers/src/test/java/org/apache/tika/parser/recognition/ObjectRecognitionParserTest.java
----------------------------------------------------------------------
diff --git 
a/tika-parsers/src/test/java/org/apache/tika/parser/recognition/ObjectRecognitionParserTest.java
 
b/tika-parsers/src/test/java/org/apache/tika/parser/recognition/ObjectRecognitionParserTest.java
index d94eea6..aaa458b 100644
--- 
a/tika-parsers/src/test/java/org/apache/tika/parser/recognition/ObjectRecognitionParserTest.java
+++ 
b/tika-parsers/src/test/java/org/apache/tika/parser/recognition/ObjectRecognitionParserTest.java
@@ -21,15 +21,14 @@ import org.apache.tika.Tika;
 import org.apache.tika.config.TikaConfig;
 import org.apache.tika.exception.TikaException;
 import org.apache.tika.io.IOUtils;
+import org.apache.tika.io.TikaInputStream;
 import org.apache.tika.metadata.Metadata;
 import org.junit.Assert;
 import org.junit.Ignore;
 import org.junit.Test;
 import org.xml.sax.SAXException;
 
-import java.io.IOException;
-import java.io.InputStream;
-import java.io.Reader;
+import java.io.*;
 import java.util.Arrays;
 import java.util.HashSet;
 import java.util.List;
@@ -40,6 +39,8 @@ import java.util.List;
 public class ObjectRecognitionParserTest {
 
     private static final String CONFIG_FILE = 
"org/apache/tika/parser/recognition/tika-config-tflow.xml";
+    private static final String CONFIG_ADDON_FILE = 
"org/apache/tika/parser/recognition/tika-config-tflow-addon.xml";
+    private static final String CONFIG_REST_FILE = 
"org/apache/tika/parser/recognition/tika-config-tflow-rest.xml";
     private static final String CAT_IMAGE = "test-documents/testJPEG.jpg";
     private static final ClassLoader loader = 
ObjectRecognitionParserTest.class.getClassLoader();
 
@@ -62,7 +63,50 @@ public class ObjectRecognitionParserTest {
                     Assert.assertTrue(message, text.contains(expectedObject));
                     Assert.assertTrue(message, 
metaValues.contains(expectedObject));
                 }
-                System.out.println(metadata);
+            }
+        }
+    }
+
+    @Ignore("Configure addon path in tika-config.xml")
+    @Test
+    public void testAddonJar() throws Exception {
+
+        try (InputStream stream = 
loader.getResourceAsStream(CONFIG_ADDON_FILE)){
+            assert stream != null;
+            Tika tika = new Tika(new TikaConfig(stream));
+            Metadata metadata = new Metadata();
+            try (InputStream imageStream = 
loader.getResourceAsStream(CAT_IMAGE)){
+                Reader reader = tika.parse(imageStream, metadata);
+                List<String> lines = IOUtils.readLines(reader);
+                String text = StringUtils.join(lines, " ");
+                String[] expectedObjects = {"Egyptian cat", "tabby cat"};
+                String metaValues = 
StringUtils.join(metadata.getValues(ObjectRecognitionParser.MD_KEY), " ");
+                for (String expectedObject : expectedObjects) {
+                    String message = "'" + expectedObject + "' must have been 
detected";
+                    Assert.assertTrue(message, text.contains(expectedObject));
+                    Assert.assertTrue(message, 
metaValues.contains(expectedObject));
+                }
+            }
+        }
+    }
+
+    @Ignore("Configure Rest API service")
+    @Test
+    public void testREST() throws Exception {
+        try (InputStream stream = 
loader.getResourceAsStream(CONFIG_REST_FILE)){
+            assert stream != null;
+            Tika tika = new Tika(new TikaConfig(stream));
+            Metadata metadata = new Metadata();
+            try (InputStream imageStream = 
loader.getResourceAsStream(CAT_IMAGE)){
+                Reader reader = tika.parse(imageStream, metadata);
+                String text = IOUtils.toString(reader);
+                String[] expectedObjects = {"Egyptian cat", "tabby cat"};
+                String metaValues = 
StringUtils.join(metadata.getValues(ObjectRecognitionParser.MD_KEY), " ");
+                for (String expectedObject : expectedObjects) {
+                    String message = "'" + expectedObject + "' must have been 
detected";
+                    Assert.assertTrue(message, text.contains(expectedObject));
+                    Assert.assertTrue(message, 
metaValues.contains(expectedObject));
+                }
             }
         }
     }

http://git-wip-us.apache.org/repos/asf/tika/blob/9e0a87e6/tika-parsers/src/test/resources/org/apache/tika/parser/recognition/tika-config-tflow-addon.xml
----------------------------------------------------------------------
diff --git 
a/tika-parsers/src/test/resources/org/apache/tika/parser/recognition/tika-config-tflow-addon.xml
 
b/tika-parsers/src/test/resources/org/apache/tika/parser/recognition/tika-config-tflow-addon.xml
new file mode 100644
index 0000000..349c7d4
--- /dev/null
+++ 
b/tika-parsers/src/test/resources/org/apache/tika/parser/recognition/tika-config-tflow-addon.xml
@@ -0,0 +1,30 @@
+<?xml version="1.0" encoding="UTF-8"?>
+<!--
+  ~ Licensed to the Apache Software Foundation (ASF) under one or more
+  ~ contributor license agreements.  See the NOTICE file distributed with
+  ~ this work for additional information regarding copyright ownership.
+  ~ The ASF licenses this file to You under the Apache License, Version 2.0
+  ~ (the "License"); you may not use this file except in compliance with
+  ~ the License.  You may obtain a copy of the License at
+  ~
+  ~    http://www.apache.org/licenses/LICENSE-2.0
+  ~
+  ~ Unless required by applicable law or agreed to in writing, software
+  ~ distributed under the License is distributed on an "AS IS" BASIS,
+  ~ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+  ~ See the License for the specific language governing permissions and
+  ~ limitations under the License.
+  -->
+<properties>
+    <parsers>
+        <parser 
class="org.apache.tika.parser.recognition.ObjectRecognitionParser">
+            <mime>image/jpeg</mime>
+            <params>
+                <param name="topN" type="int">5</param>
+                <param name="minConfidence" type="double">0.015</param>
+                <param name="class" 
type="string">org.apache.tika.parser.recognition.tf.TensorflowGrpcRecogniser</param>
+                <param name="addon" 
type="file">../tensorflow-java-1.0-jar-with-dependencies.jar</param>
+            </params>
+        </parser>
+    </parsers>
+</properties>
\ No newline at end of file

http://git-wip-us.apache.org/repos/asf/tika/blob/9e0a87e6/tika-parsers/src/test/resources/org/apache/tika/parser/recognition/tika-config-tflow-rest.xml
----------------------------------------------------------------------
diff --git 
a/tika-parsers/src/test/resources/org/apache/tika/parser/recognition/tika-config-tflow-rest.xml
 
b/tika-parsers/src/test/resources/org/apache/tika/parser/recognition/tika-config-tflow-rest.xml
new file mode 100644
index 0000000..ad72c95
--- /dev/null
+++ 
b/tika-parsers/src/test/resources/org/apache/tika/parser/recognition/tika-config-tflow-rest.xml
@@ -0,0 +1,30 @@
+<?xml version="1.0" encoding="UTF-8"?>
+
+<!--
+  ~ Licensed to the Apache Software Foundation (ASF) under one or more
+  ~ contributor license agreements.  See the NOTICE file distributed with
+  ~ this work for additional information regarding copyright ownership.
+  ~ The ASF licenses this file to You under the Apache License, Version 2.0
+  ~ (the "License"); you may not use this file except in compliance with
+  ~ the License.  You may obtain a copy of the License at
+  ~
+  ~    http://www.apache.org/licenses/LICENSE-2.0
+  ~
+  ~ Unless required by applicable law or agreed to in writing, software
+  ~ distributed under the License is distributed on an "AS IS" BASIS,
+  ~ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+  ~ See the License for the specific language governing permissions and
+  ~ limitations under the License.
+  -->
+<properties>
+    <parsers>
+        <parser 
class="org.apache.tika.parser.recognition.ObjectRecognitionParser">
+            <mime>image/jpeg</mime>
+            <params>
+                <param name="topN" type="int">7</param>
+                <param name="minConfidence" type="double">0.015</param>
+                <param name="class" 
type="string">org.apache.tika.parser.recognition.tf.TensorflowRESTRecogniser</param>
+            </params>
+        </parser>
+    </parsers>
+</properties>
\ No newline at end of file

Reply via email to