Author: mattmann
Date: Fri Nov 27 22:46:09 2015
New Revision: 1716927

URL: http://svn.apache.org/viewvc?rev=1716927&view=rev
Log:
Fix for TIKA-1798 Parser for Video Similarity using PooledTimeSeries metric 
contributed by Aditya Dhulipala and Chris Mattmann this closes #64.

Added:
    tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/pot/
    
tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/pot/PooledTimeSeriesParser.java
Modified:
    tika/trunk/CHANGES.txt
    
tika/trunk/tika-parsers/src/main/resources/META-INF/services/org.apache.tika.parser.Parser

Modified: tika/trunk/CHANGES.txt
URL: 
http://svn.apache.org/viewvc/tika/trunk/CHANGES.txt?rev=1716927&r1=1716926&r2=1716927&view=diff
==============================================================================
--- tika/trunk/CHANGES.txt (original)
+++ tika/trunk/CHANGES.txt Fri Nov 27 22:46:09 2015
@@ -1,5 +1,9 @@
 Release 1.12 - Current Development
 
+  * A parser to compute motion properties in Videos, e.g., 
+    Histogram of Oriented Gradients and Histogram of Optical Flows
+    using the Pooled Time Series algorithm, was added (TIKA-1798).
+
   * Provide NamedEntityParser which exposes Named Entity Recognition
     from OpenNLP and Stanford NER providers (TIKA-1787, GitHub-61,
     GitHub-62).

Added: 
tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/pot/PooledTimeSeriesParser.java
URL: 
http://svn.apache.org/viewvc/tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/pot/PooledTimeSeriesParser.java?rev=1716927&view=auto
==============================================================================
--- 
tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/pot/PooledTimeSeriesParser.java
 (added)
+++ 
tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/pot/PooledTimeSeriesParser.java
 Fri Nov 27 22:46:09 2015
@@ -0,0 +1,232 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.tika.parser.pot;
+
+import org.apache.tika.exception.TikaException;
+import org.apache.tika.io.TemporaryResources;
+import org.apache.tika.io.TikaInputStream;
+import org.apache.tika.metadata.Metadata;
+import org.apache.tika.mime.MediaType;
+import org.apache.tika.parser.AbstractParser;
+import org.apache.tika.parser.ParseContext;
+import org.apache.tika.parser.external.ExternalParser;
+import org.apache.tika.sax.XHTMLContentHandler;
+import org.xml.sax.ContentHandler;
+import org.xml.sax.SAXException;
+
+import java.io.ByteArrayOutputStream;
+import java.io.File;
+import java.io.FileInputStream;
+import java.io.IOException;
+import java.io.InputStream;
+import java.io.InputStreamReader;
+import java.io.BufferedReader;
+import java.util.logging.Logger;
+import org.apache.commons.exec.CommandLine;
+import org.apache.commons.exec.DefaultExecutor;
+import org.apache.commons.exec.ExecuteWatchdog;
+import org.apache.commons.exec.PumpStreamHandler;
+import org.apache.commons.exec.environment.EnvironmentUtils;
+import org.xml.sax.helpers.AttributesImpl;
+
+import java.util.Arrays;
+import java.util.Collections;
+import java.util.HashSet;
+import java.util.Set;
+
+import static java.nio.charset.StandardCharsets.UTF_8;
+
+public class PooledTimeSeriesParser extends AbstractParser {
+
+  private static final long serialVersionUID = -2855917932512164988L;
+  private static final Set<MediaType> SUPPORTED_TYPES = Collections
+      .unmodifiableSet(new HashSet<MediaType>(Arrays.asList(new MediaType[] {
+          MediaType.video("avi"), MediaType.video("mp4")
+      // TODO: Add all supported video types
+          })));
+
+  private static final Logger LOG = 
Logger.getLogger(PooledTimeSeriesParser.class.getName());
+
+  public boolean isAvailable() {
+    return ExternalParser.check(
+        new String[] { "pooled-time-series", "--help" }, -1);
+  }
+
+  /**
+   * Returns the set of media types supported by this parser when used with the
+   * given parse context.
+   *
+   * @param context
+   *          parse context
+   * @return immutable set of media types
+   * @since Apache Tika 0.7
+   */
+  @Override
+  public Set<MediaType> getSupportedTypes(ParseContext context) {
+    return SUPPORTED_TYPES;
+  }
+
+  /**
+   * Parses a document stream into a sequence of XHTML SAX events. Fills in
+   * related document metadata in the given metadata object.
+   * <p>
+   * The given document stream is consumed but not closed by this method. The
+   * responsibility to close the stream remains on the caller.
+   * <p>
+   * Information about the parsing context can be passed in the context
+   * parameter. See the parser implementations for the kinds of context
+   * information they expect.
+   *
+   * @param stream
+   *          the document stream (input)
+   * @param handler
+   *          handler for the XHTML SAX events (output)
+   * @param metadata
+   *          document metadata (input and output)
+   * @param context
+   *          parse context
+   * @throws IOException
+   *           if the document stream could not be read
+   * @throws SAXException
+   *           if the SAX events could not be processed
+   * @throws TikaException
+   *           if the document could not be parsed
+   * @since Apache Tika 0.5
+   */
+  @Override
+  public void parse(InputStream stream, ContentHandler handler,
+      Metadata metadata, ParseContext context) throws IOException,
+      SAXException, TikaException {
+
+    if (!isAvailable()) {
+      LOG.warning(
+          "PooledTimeSeries not installed!");
+      return;
+    }
+
+    XHTMLContentHandler xhtml = new XHTMLContentHandler(handler, metadata);
+
+    TemporaryResources tmp = new TemporaryResources();
+    File output = null;
+    try {
+      TikaInputStream tikaStream = TikaInputStream.get(stream, tmp);
+      File input = tikaStream.getFile();
+      String cmdOutput = computePoT(input);
+      FileInputStream ofStream = new FileInputStream(new File(
+          input.getAbsoluteFile() + ".of.txt"));
+      FileInputStream ogStream = new FileInputStream(new File(
+          input.getAbsoluteFile() + ".hog.txt"));
+      extractHeaderOutput(ofStream, metadata, "of");
+      extractHeaderOutput(ogStream, metadata, "og");
+      xhtml.startDocument();
+      doExtract(ofStream, xhtml, "Histogram of Optical Flows (HOF)",
+          metadata.get("of_frames"), metadata.get("of_vecSize"));
+      doExtract(ogStream, xhtml, "Histogram of Oriented Gradients (HOG)",
+          metadata.get("og_frames"), metadata.get("og_vecSize"));
+      xhtml.endDocument();
+
+    } finally {
+      tmp.dispose();
+      if (output != null) {
+        output.delete();
+      }
+    }
+  }
+
+  private String computePoT(File input)
+      throws IOException, TikaException {
+
+    CommandLine cmdLine = new CommandLine("pooled-time-series");
+    ByteArrayOutputStream outputStream = new ByteArrayOutputStream();
+    cmdLine.addArgument("-f");
+    cmdLine.addArgument(input.getAbsolutePath());
+    LOG.fine("Executing: " + cmdLine);
+    DefaultExecutor exec = new DefaultExecutor();
+    exec.setExitValue(0);
+    ExecuteWatchdog watchdog = new ExecuteWatchdog(60000);
+    exec.setWatchdog(watchdog);
+    PumpStreamHandler streamHandler = new PumpStreamHandler(outputStream);
+    exec.setStreamHandler(streamHandler);
+    int exitValue = exec
+        .execute(cmdLine, EnvironmentUtils.getProcEnvironment());
+    return outputStream.toString("UTF-8");
+
+  }
+
+  /**
+   * Reads the contents of the given stream and write it to the given XHTML
+   * content handler. The stream is closed once fully processed.
+   *
+   * @param stream
+   *          Stream where is the result of ocr
+   * @param xhtml
+   *          XHTML content handler
+   * @param tableTitle
+   *          The name of the matrix/table to display.
+   * @param frames
+   *          Number of frames read from the video.
+   * @param vecSize
+   *          Size of the OF or HOG vector.
+   * @throws SAXException
+   *           if the XHTML SAX events could not be handled
+   * @throws IOException
+   *           if an input error occurred
+   */
+  private void doExtract(InputStream stream, XHTMLContentHandler xhtml,
+      String tableTitle, String frames, String vecSize) throws SAXException,
+      IOException {
+    BufferedReader reader = new BufferedReader(new InputStreamReader(stream,
+        UTF_8));
+    String line = null;
+    AttributesImpl attributes = new AttributesImpl();
+    attributes.addAttribute("", "", "rows", "CDATA", frames);
+    attributes.addAttribute("", "", "cols", "CDATA", vecSize);
+
+    xhtml.startElement("h3");
+    xhtml.characters(tableTitle);
+    xhtml.endElement("h3");
+    xhtml.startElement("table", attributes);
+    while ((line = reader.readLine()) != null) {
+      xhtml.startElement("tr");
+      for (String val : line.split(" ")) {
+        xhtml.startElement("td");
+        xhtml.characters(val);
+        xhtml.endElement("td");
+      }
+      xhtml.endElement("tr");
+    }
+    xhtml.endElement("table");
+  }
+
+  private void extractHeaderOutput(InputStream stream, Metadata metadata,
+      String prefix) throws IOException {
+    BufferedReader reader = new BufferedReader(new InputStreamReader(stream,
+        UTF_8));
+    String line = reader.readLine();
+    String[] firstLine = line.split(" ");
+    String frames = firstLine[0];
+    String vecSize = firstLine[1];
+
+    if (prefix == null) {
+      prefix = "";
+    }
+    metadata.add(prefix + "_frames", frames);
+    metadata.add(prefix + "_vecSize", vecSize);
+  }
+
+}

Modified: 
tika/trunk/tika-parsers/src/main/resources/META-INF/services/org.apache.tika.parser.Parser
URL: 
http://svn.apache.org/viewvc/tika/trunk/tika-parsers/src/main/resources/META-INF/services/org.apache.tika.parser.Parser?rev=1716927&r1=1716926&r2=1716927&view=diff
==============================================================================
--- 
tika/trunk/tika-parsers/src/main/resources/META-INF/services/org.apache.tika.parser.Parser
 (original)
+++ 
tika/trunk/tika-parsers/src/main/resources/META-INF/services/org.apache.tika.parser.Parser
 Fri Nov 27 22:46:09 2015
@@ -60,6 +60,7 @@ org.apache.tika.parser.code.SourceCodePa
 org.apache.tika.parser.mat.MatParser
 org.apache.tika.parser.ocr.TesseractOCRParser
 org.apache.tika.parser.gdal.GDALParser
+org.apache.tika.parser.pot.PooledTimeSeriesParser
 org.apache.tika.parser.grib.GribParser
 org.apache.tika.parser.jdbc.SQLite3Parser
 org.apache.tika.parser.isatab.ISArchiveParser


Reply via email to