Object recognition parser, TensorFlow-based implementation, and test cases for these
Project: http://git-wip-us.apache.org/repos/asf/tika/repo Commit: http://git-wip-us.apache.org/repos/asf/tika/commit/2184e2c2 Tree: http://git-wip-us.apache.org/repos/asf/tika/tree/2184e2c2 Diff: http://git-wip-us.apache.org/repos/asf/tika/diff/2184e2c2 Branch: refs/heads/TIKA-1508 Commit: 2184e2c2c2a0e507be6be4f9692e0fab5b38a476 Parents: eccc153 Author: Thamme Gowda <[email protected]> Authored: Sat Jun 11 19:04:49 2016 -0700 Committer: Thamme Gowda <[email protected]> Committed: Sat Jun 11 19:04:49 2016 -0700 ---------------------------------------------------------------------- .../parser/recognition/ObjectRecogniser.java | 73 +++++++++ .../recognition/ObjectRecognitionParser.java | 143 ++++++++++++++++++ .../parser/recognition/RecognisedObject.java | 91 +++++++++++ .../tf/TensorflowImageRecParser.java | 149 +++++++++++++++++++ .../ObjectRecognitionParserTest.java | 68 +++++++++ .../tf/TensorflowImageRecParserTest.java | 56 +++++++ .../parser/recognition/tika-config-tflow.xml | 29 ++++ 7 files changed, 609 insertions(+) ---------------------------------------------------------------------- http://git-wip-us.apache.org/repos/asf/tika/blob/2184e2c2/tika-parsers/src/main/java/org/apache/tika/parser/recognition/ObjectRecogniser.java ---------------------------------------------------------------------- diff --git a/tika-parsers/src/main/java/org/apache/tika/parser/recognition/ObjectRecogniser.java b/tika-parsers/src/main/java/org/apache/tika/parser/recognition/ObjectRecogniser.java new file mode 100644 index 0000000..3776c1e --- /dev/null +++ b/tika-parsers/src/main/java/org/apache/tika/parser/recognition/ObjectRecogniser.java @@ -0,0 +1,73 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. 
+ * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.apache.tika.parser.recognition; + +import org.apache.tika.base.Configurable; +import org.apache.tika.exception.TikaConfigException; +import org.apache.tika.exception.TikaException; +import org.apache.tika.metadata.Metadata; +import org.apache.tika.mime.MediaType; +import org.apache.tika.parser.ParseContext; +import org.apache.tika.parser.recognition.tf.TensorflowImageRecParser; +import org.xml.sax.ContentHandler; +import org.xml.sax.SAXException; + +import java.io.IOException; +import java.io.InputStream; +import java.util.List; +import java.util.Set; + +/** + * This is a contract for object recognisers used by {@link ObjectRecognitionParser} + * @see {@link TensorflowImageRecParser} for an example + */ +public interface ObjectRecogniser extends Configurable { + + /** + * The mimes supported by this recogniser + * @return set of mediatypes + */ + Set<MediaType> getSupportedMimes(); + + /** + * Is this service available + * @return {@code true} when the service is available, {@code false} otherwise + */ + boolean isAvailable(); + + /** + * This is the hook for configuring the recogniser + * @param context configuration instance in the form of context + * @throws TikaConfigException when there is an issue with configuration + */ + void configure(ParseContext context) throws TikaConfigException; + + /** + * Recognise the objects in the stream + * @param stream content stream 
+ * @param handler tika's content handle + * @param metadata metadata instance + * @param context parser context + * @return List of {@link RecognisedObject}s + * @throws IOException when an I/O error occurs + * @throws SAXException when an issue with XML occurs + * @throws TikaException any generic error + */ + List<RecognisedObject> recognise(InputStream stream, ContentHandler handler, + Metadata metadata, ParseContext context) + throws IOException, SAXException, TikaException; +} http://git-wip-us.apache.org/repos/asf/tika/blob/2184e2c2/tika-parsers/src/main/java/org/apache/tika/parser/recognition/ObjectRecognitionParser.java ---------------------------------------------------------------------- diff --git a/tika-parsers/src/main/java/org/apache/tika/parser/recognition/ObjectRecognitionParser.java b/tika-parsers/src/main/java/org/apache/tika/parser/recognition/ObjectRecognitionParser.java new file mode 100644 index 0000000..274f884 --- /dev/null +++ b/tika-parsers/src/main/java/org/apache/tika/parser/recognition/ObjectRecognitionParser.java @@ -0,0 +1,143 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ +package org.apache.tika.parser.recognition; + +import org.apache.tika.config.Field; +import org.apache.tika.exception.TikaConfigException; +import org.apache.tika.exception.TikaException; +import org.apache.tika.metadata.Metadata; +import org.apache.tika.mime.MediaType; +import org.apache.tika.parser.AbstractParser; +import org.apache.tika.parser.ParseContext; +import org.apache.tika.parser.recognition.tf.TensorflowImageRecParser; +import org.apache.tika.sax.XHTMLContentHandler; +import org.apache.tika.utils.AnnotationUtils; +import org.apache.tika.utils.ServiceLoaderUtils; +import org.slf4j.Logger; +import org.slf4j.LoggerFactory; +import org.xml.sax.ContentHandler; +import org.xml.sax.SAXException; + +import java.io.IOException; +import java.io.InputStream; +import java.util.Collections; +import java.util.Comparator; +import java.util.List; +import java.util.Set; + + +/** + * This parser recognises objects from Images. + * The Object Recognition implementation can be switched using 'class' argument. 
+ * + * <b>Example Usage : </b> + * <pre> + * <properties> + * <parsers> + * <parser class="org.apache.tika.parser.recognition.ObjectRecognitionParser"> + * <mime>image/jpeg</mime> + * <params> + * <param name="topN" type="int">2</param> + * <param name="minConfidence" type="double">0.015</param> + * <param name="class" type="string">org.apache.tika.parser.recognition.tf.TensorflowImageRecParser</param> + * </params> + * </parser> + * </parsers> + * </properties> + * </pre> + * + * @since Apache Tika 1.14 + */ +public class ObjectRecognitionParser extends AbstractParser { + + public static final Logger LOG = LoggerFactory.getLogger(ObjectRecognitionParser.class); + public static final String MD_KEY = "OBJECT"; + private static final Comparator<RecognisedObject> DESC_CONFIDENCE_SORTER = + new Comparator<RecognisedObject>() { + @Override + public int compare(RecognisedObject o1, RecognisedObject o2) { + return Double.compare(o2.getConfidence(), o1.getConfidence()); + } + }; + + @Field private double minConfidence = 0.05; + + @Field private int topN = 2; + + private ObjectRecogniser recogniser = new TensorflowImageRecParser(); + + @Field(name = "class") + public void setRecogniser(String recogniserClass) { + this.recogniser = ServiceLoaderUtils.newInstance(recogniserClass); + } + + @Override + public void configure(ParseContext context) throws TikaConfigException { + super.configure(context); + AnnotationUtils.assignFieldParams(recogniser, context.getParams()); + recogniser.configure(context); + LOG.info("minConfidence = {}, topN={}", minConfidence, topN); + LOG.info("Recogniser = {}", recogniser.getClass().getName()); + LOG.info("Recogniser Available = {}", recogniser.isAvailable()); + } + + @Override + public Set<MediaType> getSupportedTypes(ParseContext context) { + return recogniser.isAvailable() ? 
recogniser.getSupportedMimes() : Collections.<MediaType>emptySet(); + } + + @Override + public synchronized void parse(InputStream stream, ContentHandler handler, Metadata metadata, ParseContext context) + throws IOException, SAXException, TikaException { + if (!recogniser.isAvailable()) { + LOG.warn("{} is not available for service", recogniser.getClass()); + return; + } + metadata.set("object.rec.impl", recogniser.getClass().getSimpleName()); + long start = System.currentTimeMillis(); + List<RecognisedObject> objects = recogniser.recognise(stream, handler, metadata, context); + LOG.debug("Found {} objects", objects != null ? objects.size() : 0); + LOG.debug("Time taken {}ms", System.currentTimeMillis() - start); + if (objects != null && !objects.isEmpty()){ + + XHTMLContentHandler xhtml = new XHTMLContentHandler(handler, metadata); + xhtml.startElement("ol", "id", "objects"); + Collections.sort(objects, DESC_CONFIDENCE_SORTER); + int count = 0; + for (RecognisedObject object : objects) { + if (object.getConfidence() >= minConfidence) { + LOG.debug("Add {}", object); + count++; + metadata.add(MD_KEY, object.getLabel()); + //writing to handler + xhtml.startElement("li", "id", object.getId()); + String text = String.format(" %s [%s](confidence = %f )", + object.getLabel(), object.getLabelLang(), object.getConfidence()); + xhtml.characters(text); + xhtml.endElement("li"); + if (count >= topN) { + break; + } + } + } + xhtml.endElement("ol"); + } else { + LOG.warn("NO objects"); + metadata.add("no.objects", Boolean.TRUE.toString()); + } + } +} http://git-wip-us.apache.org/repos/asf/tika/blob/2184e2c2/tika-parsers/src/main/java/org/apache/tika/parser/recognition/RecognisedObject.java ---------------------------------------------------------------------- diff --git a/tika-parsers/src/main/java/org/apache/tika/parser/recognition/RecognisedObject.java b/tika-parsers/src/main/java/org/apache/tika/parser/recognition/RecognisedObject.java new file mode 100644 index 
0000000..d0317c8 --- /dev/null +++ b/tika-parsers/src/main/java/org/apache/tika/parser/recognition/RecognisedObject.java @@ -0,0 +1,91 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.apache.tika.parser.recognition; + +/** + * A model for recognised objects from graphics and texts typically includes + * human readable label for the object, language of the label, id and confidence score. + * + * @since Apache Tika 1.14 + */ +public class RecognisedObject { + + /** + * Label of this object. 
Usually the name given to this object by humans + */ + private String label; + /** + * Language of label, Example : english + */ + private String labelLang; + /** + * Identifier for this object + */ + private String id; + /** + * Confidence score + */ + private double confidence; + + public RecognisedObject(String label, String labelLang, String id, double confidence) { + this.label = label; + this.labelLang = labelLang; + this.id = id; + this.confidence = confidence; + } + + public String getLabel() { + return label; + } + + public void setLabel(String label) { + this.label = label; + } + + public String getLabelLang() { + return labelLang; + } + + public void setLabelLang(String labelLang) { + this.labelLang = labelLang; + } + + public String getId() { + return id; + } + + public void setId(String id) { + this.id = id; + } + + public double getConfidence() { + return confidence; + } + + public void setConfidence(double confidence) { + this.confidence = confidence; + } + + @Override + public String toString() { + return "RecognisedObject{" + + "label='" + label + "\' (" + labelLang + ')' + + ", id='" + id + '\'' + + ", confidence=" + confidence + + '}'; + } +} http://git-wip-us.apache.org/repos/asf/tika/blob/2184e2c2/tika-parsers/src/main/java/org/apache/tika/parser/recognition/tf/TensorflowImageRecParser.java ---------------------------------------------------------------------- diff --git a/tika-parsers/src/main/java/org/apache/tika/parser/recognition/tf/TensorflowImageRecParser.java b/tika-parsers/src/main/java/org/apache/tika/parser/recognition/tf/TensorflowImageRecParser.java new file mode 100644 index 0000000..eb1d536 --- /dev/null +++ b/tika-parsers/src/main/java/org/apache/tika/parser/recognition/tf/TensorflowImageRecParser.java @@ -0,0 +1,149 @@ +/** + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. 
See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.apache.tika.parser.recognition.tf; + +import org.apache.http.HttpResponse; +import org.apache.http.client.methods.HttpGet; +import org.apache.http.impl.client.DefaultHttpClient; +import org.apache.tika.config.Field; +import org.apache.tika.exception.TikaConfigException; +import org.apache.tika.exception.TikaException; +import org.apache.tika.io.IOUtils; +import org.apache.tika.metadata.Metadata; +import org.apache.tika.mime.MediaType; +import org.apache.tika.parser.ParseContext; +import org.apache.tika.parser.external.ExternalParser; +import org.apache.tika.parser.recognition.ObjectRecogniser; +import org.apache.tika.parser.recognition.RecognisedObject; +import org.slf4j.Logger; +import org.slf4j.LoggerFactory; +import org.xml.sax.ContentHandler; +import org.xml.sax.SAXException; + +import java.io.*; +import java.net.URI; +import java.util.*; +import java.util.regex.Pattern; + +/** + * This is an implementation of {@link ObjectRecogniser} powered by <a href="http://www.tensorflow.org"> Tensorflow <a/> + * convolutional neural network (CNN). This implementation binds to Python API using {@link ExternalParser}. + * <br/> + * // NOTE: This is a proof of concept for an efficient implementation using JNI binding to Tensorflow's C++ api. 
+ * + * <br/> + * <p> + * <b>Environment Setup:</b> + * <ol> + * <li> Python must be available </li> + * <li> Tensorflow must be available for import by the python script. <a href="https://www.tensorflow.org/versions/r0.9/get_started/os_setup.html#pip-installation"> Setup Instructions here </a></li> + * <li> All dependencies of tensor flow (such as numpy) must also be available. <a href="https://www.tensorflow.org/versions/r0.9/tutorials/image_recognition/index.html#image-recognition">Follow the image recognition guide and make sure it works</a></li> + * </ol> + * </p> + * @since Apache Tika 1.14 + */ +public class TensorflowImageRecParser extends ExternalParser implements ObjectRecogniser { + + private static final Logger LOG = LoggerFactory.getLogger(TensorflowImageRecParser.class); + + private static final URI defaultScriptUri = URI.create("https://raw.githubusercontent.com/tensorflow/tensorflow/122cdce33e3e0a01a7f82645617317530aa571fb/tensorflow/models/image/imagenet/classify_image.py"); + private static final Set<MediaType> supportedMimes = Collections.singleton(MediaType.image("jpeg")); + private static final File defaultScriptFile = new File("tensorflow/tf-objectrec.py"); + private static final File defaultModelFile = new File("tensorflow/tf-objectrec-model"); + private static final LineConsumer ignoredLineLogger = new LineConsumer() { + @Override + public void consume(String line) { + LOG.debug(line); + } + }; + + @Field private URI scriptUri = defaultScriptUri; + @Field private String executor = "python"; + @Field private File scriptFile = defaultScriptFile; + @Field private String modelArg = "--model_dir"; + @Field private File modelFile = defaultModelFile; + @Field private String imageArg = "--image_file"; + @Field private String outPattern = "(.*) \\(score = ([0-9]+\\.[0-9]+)\\)$"; + @Field private String availabilityTestArgs = ""; //when no args are given, the script will test itself! 
+ + private boolean available = false; + + public Set<MediaType> getSupportedMimes() { + return supportedMimes; + } + + @Override + public boolean isAvailable() { + return available; + } + + @Override + public void configure(ParseContext context) throws TikaConfigException { + super.configure(context); + try { + if (!modelFile.exists()) { + modelFile.getParentFile().mkdirs(); + LOG.warn("Model doesn't exist at {}. Expecting the script to download it.", modelFile); + } + if (!scriptFile.exists()) { + scriptFile.getParentFile().mkdirs(); + LOG.info("GET : {} -> {}", scriptUri, scriptFile); + DefaultHttpClient httpClient = new DefaultHttpClient(); + HttpGet getMethod = new HttpGet(scriptUri); + try (BufferedOutputStream stream = new BufferedOutputStream(new FileOutputStream(scriptFile))) { + HttpResponse response = httpClient.execute(getMethod); + IOUtils.copy(response.getEntity().getContent(), stream); + } + LOG.debug("Downloaded.."); + } + String[] availabilityCheckArgs = {executor, scriptFile.getAbsolutePath(), + modelArg, modelFile.getAbsolutePath(), availabilityTestArgs}; + available = ExternalParser.check(availabilityCheckArgs); + LOG.debug("Available? 
{}", available); + if (!available) { + return; + } + String[] parseCmd = { + executor, scriptFile.getAbsolutePath(), + modelArg, modelFile.getAbsolutePath(), + imageArg, INPUT_FILE_TOKEN, + "--out_file", OUTPUT_FILE_TOKEN}; //inserting output token to let external parser parse metadata + setCommand(parseCmd); + HashMap<Pattern, String> patterns = new HashMap<>(); + patterns.put(Pattern.compile(outPattern), null); + setMetadataExtractionPatterns(patterns); + setIgnoredLineConsumer(ignoredLineLogger); + } catch (Exception e) { + throw new TikaConfigException(e.getMessage(), e); + } + } + + @Override + public List<RecognisedObject> recognise(InputStream stream, ContentHandler handler, + Metadata metadata, ParseContext context) + throws IOException, SAXException, TikaException { + Metadata md = new Metadata(); + parse(stream, handler, md, context); + List<RecognisedObject> objects = new ArrayList<>(); + for (String key: md.names()) { + double confidence = Double.parseDouble(md.get(key)); + objects.add(new RecognisedObject(key, "eng", key, confidence)); + } + return objects; + } +} + http://git-wip-us.apache.org/repos/asf/tika/blob/2184e2c2/tika-parsers/src/test/java/org/apache/tika/parser/recognition/ObjectRecognitionParserTest.java ---------------------------------------------------------------------- diff --git a/tika-parsers/src/test/java/org/apache/tika/parser/recognition/ObjectRecognitionParserTest.java b/tika-parsers/src/test/java/org/apache/tika/parser/recognition/ObjectRecognitionParserTest.java new file mode 100644 index 0000000..fc96b1d --- /dev/null +++ b/tika-parsers/src/test/java/org/apache/tika/parser/recognition/ObjectRecognitionParserTest.java @@ -0,0 +1,68 @@ +/** + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. 
+ * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.apache.tika.parser.recognition; + +import org.apache.commons.lang.StringUtils; +import org.apache.tika.Tika; +import org.apache.tika.config.TikaConfig; +import org.apache.tika.exception.TikaException; +import org.apache.tika.io.IOUtils; +import org.apache.tika.metadata.Metadata; +import org.junit.Assert; +import org.junit.Ignore; +import org.junit.Test; +import org.xml.sax.SAXException; + +import java.io.IOException; +import java.io.InputStream; +import java.io.Reader; +import java.util.Arrays; +import java.util.HashSet; +import java.util.List; + +/** + * Testcases for Object Recognition Parser + */ +public class ObjectRecognitionParserTest { + + private static final String CONFIG_FILE = "org/apache/tika/parser/recognition/tika-config-tflow.xml"; + private static final String CAT_IMAGE = "test-documents/testJPEG.jpg"; + private static final ClassLoader loader = ObjectRecognitionParserTest.class.getClassLoader(); + + @Ignore("If tensorflow not available Ignore") @Test + public void jpegTesorflowTest() throws IOException, TikaException, SAXException { + + try(InputStream stream = loader.getResourceAsStream(CONFIG_FILE)){ + assert stream != null; + Tika tika = new Tika(new TikaConfig(stream)); + Metadata metadata = new Metadata(); + try (InputStream imageStream = loader.getResourceAsStream(CAT_IMAGE)){ + Reader reader = tika.parse(imageStream, metadata); + List<String> lines 
= IOUtils.readLines(reader); + String text = StringUtils.join(lines, " "); + String[] expectedObjects = {"Egyptian cat", "Border collie"}; + HashSet<String> objects = new HashSet<>(); + objects.addAll(Arrays.asList(metadata.getValues(ObjectRecognitionParser.MD_KEY))); + for (String expectedObject : expectedObjects) { + String message = "'" + expectedObject + "' must have been detected"; + Assert.assertTrue(message, text.contains(expectedObject)); + Assert.assertTrue(message, objects.contains(expectedObject)); + } + } + } + } +} \ No newline at end of file http://git-wip-us.apache.org/repos/asf/tika/blob/2184e2c2/tika-parsers/src/test/java/org/apache/tika/parser/recognition/tf/TensorflowImageRecParserTest.java ---------------------------------------------------------------------- diff --git a/tika-parsers/src/test/java/org/apache/tika/parser/recognition/tf/TensorflowImageRecParserTest.java b/tika-parsers/src/test/java/org/apache/tika/parser/recognition/tf/TensorflowImageRecParserTest.java new file mode 100644 index 0000000..038f2d3 --- /dev/null +++ b/tika-parsers/src/test/java/org/apache/tika/parser/recognition/tf/TensorflowImageRecParserTest.java @@ -0,0 +1,56 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +package org.apache.tika.parser.recognition.tf; + +import org.apache.tika.metadata.Metadata; +import org.apache.tika.parser.ParseContext; +import org.apache.tika.parser.recognition.RecognisedObject; +import org.junit.Assert; +import org.junit.Ignore; +import org.junit.Test; +import org.xml.sax.helpers.DefaultHandler; + +import java.io.InputStream; +import java.util.HashSet; +import java.util.List; +import java.util.Set; + + +@Ignore +public class TensorflowImageRecParserTest { + + @Test + public void recognise() throws Exception { + TensorflowImageRecParser recogniser = new TensorflowImageRecParser(); + recogniser.configure(new ParseContext()); + try (InputStream stream = getClass().getClassLoader().getResourceAsStream("test-documents/testJPEG.jpg")) { + List<RecognisedObject> objects = recogniser.recognise(stream, new DefaultHandler(), new Metadata(), new ParseContext()); + Assert.assertTrue(5 == objects.size()); + Set<String> objectLabels = new HashSet<>(); + for (RecognisedObject object : objects) { + objectLabels.add(object.getLabel()); + } + System.out.println(objectLabels); + String[] expected = {"English foxhound", "Egyptian cat", "collie", "Border collie"}; + for (String label : expected) { + Assert.assertTrue(label + " is expected", objectLabels.contains(label)); + } + } + } + +} \ No newline at end of file http://git-wip-us.apache.org/repos/asf/tika/blob/2184e2c2/tika-parsers/src/test/resources/org/apache/tika/parser/recognition/tika-config-tflow.xml ---------------------------------------------------------------------- diff --git a/tika-parsers/src/test/resources/org/apache/tika/parser/recognition/tika-config-tflow.xml b/tika-parsers/src/test/resources/org/apache/tika/parser/recognition/tika-config-tflow.xml new file mode 100644 index 0000000..f848d15 --- /dev/null +++ b/tika-parsers/src/test/resources/org/apache/tika/parser/recognition/tika-config-tflow.xml @@ -0,0 +1,29 @@ +<?xml version="1.0" encoding="UTF-8"?> +<!-- + Licensed to the Apache 
Software Foundation (ASF) under one or more + contributor license agreements. See the NOTICE file distributed with + this work for additional information regarding copyright ownership. + The ASF licenses this file to You under the Apache License, Version 2.0 + (the "License"); you may not use this file except in compliance with + the License. You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. +--> +<properties> + <parsers> + <parser class="org.apache.tika.parser.recognition.ObjectRecognitionParser"> + <mime>image/jpeg</mime> + <params> + <param name="topN" type="int">2</param> + <param name="minConfidence" type="double">0.015</param> + <param name="class" type="string">org.apache.tika.parser.recognition.tf.TensorflowImageRecParser</param> + </params> + </parser> + </parsers> +</properties> \ No newline at end of file
