Re: unit test error for new parser

Tyler Palsulich Wed, 04 Jun 2014 11:03:06 -0700

Hi Annie,
I put together a patch to work your parser into Tika under
org.apache.tika.parser.mat. You'll need to put a valid MATLAB file (I
called it MatlabFile.m, but just update the test) under test-documents for
the parsing in the test to work properly -- the file I had failed because
of "com.jmatio.io.MatlabIOException: This is not a valid MATLAB 5.0
MAT-file."
To apply the patch, run `patch -p0 -i matlab_parsing.diff` in the root of
the tika trunk. Run the test with `mvn test -DfailIfNoTests=false
-Dtest=org.apache.tika.parser.mat.MatParserTest`.


Hope that helps,
Tyler


On Wed, Jun 4, 2014 at 1:27 AM, Matthias Krueger <[email protected]> wrote:

>
> Hi Annie,
>
>
>  [INFO] -------------------------------------------------------------
>> [ERROR] COMPILATION ERROR :
>> [INFO] -------------------------------------------------------------
>> [ERROR]
>> /Users/annbryant/TIKA/tika/tika-parsers/src/main/java/
>> org/apache/tika/parser/mat/MatParser.java:[69,23]
>> cannot find symbol
>> symbol  : constructor
>> MatFileReader(org.apache.tika.io.CloseShieldInputStream)
>> location: class com.jmatio.io.MatFileReader
>>
>
> You're trying to create a com.jmatio.io.MatFileReader passing an
> InputStream while the constructor needs something else (a java.io.File).
> TikaInputStream supports spooling InputStreams to a temp file so this might
> be the way to go. Have a look at org.apache.tika.parser.jpeg.JpegParser#parse
> and how it passes a File reference to the ImageMetadataExtractor.
>
> Cheers
> Matthias
>

Index: tika-parsers/pom.xml
===================================================================
--- tika-parsers/pom.xml        (revision 1600365)
+++ tika-parsers/pom.xml        (working copy)
@@ -103,10 +103,16 @@
       <artifactId>pdfbox</artifactId>
       <version>${pdfbox.version}</version>
     </dependency>
-    <!-- TIKA-370: PDFBox declares the Bouncy Castle dependencies
-         as optional, but we prefer to have them always to avoid
-         problems with encrypted PDFs. -->
     <dependency>
+      <groupId>net.sourceforge.jmatio</groupId>
+      <artifactId>jmatio</artifactId>
+      <version>1.0</version>
+    </dependency>
+
+      <!-- TIKA-370: PDFBox declares the Bouncy Castle dependencies
+           as optional, but we prefer to have them always to avoid
+           problems with encrypted PDFs. -->
+    <dependency>
       <groupId>org.bouncycastle</groupId>
       <artifactId>bcmail-jdk15</artifactId>
       <version>1.45</version>
Index: tika-parsers/src/main/java/org/apache/tika/parser/mat/MatParser.java
===================================================================
--- tika-parsers/src/main/java/org/apache/tika/parser/mat/MatParser.java        
(revision 0)
+++ tika-parsers/src/main/java/org/apache/tika/parser/mat/MatParser.java        
(working copy)
@@ -0,0 +1,149 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.tika.parser.mat;
+
+//JDK imports
+import java.io.IOException;
+import java.io.InputStream;
+import java.util.Collections;
+import java.util.Collection;
+import java.util.Set;
+import java.util.Map;
+import java.util.Iterator;
+
+import org.apache.tika.exception.TikaException;
+import org.apache.tika.io.CloseShieldInputStream;
+import org.apache.tika.io.IOUtils;
+import org.apache.tika.io.TikaInputStream;
+import org.apache.tika.metadata.Metadata;
+import org.apache.tika.parser.AbstractParser;
+import org.apache.tika.parser.ParseContext;
+import org.apache.tika.parser.Parser;
+import org.apache.tika.mime.MediaType;
+import org.apache.tika.sax.XHTMLContentHandler;
+import org.xml.sax.ContentHandler;
+import org.xml.sax.SAXException;
+
+//JMatIO imports
+import com.jmatio.io.MatFileHeader;
+import com.jmatio.io.MatFileReader;
+import com.jmatio.types.MLArray;
+import com.jmatio.types.MLStructure;
+
+public class MatParser extends AbstractParser {
+
+    public static final String MATLAB_MIME_TYPE =
+            "application/matlab.mat";
+
+    private final Set<MediaType> SUPPORTED_TYPES =
+            Collections.singleton(MediaType.application("matlab.mat"));
+
+    public Set<MediaType> getSupportedTypes(ParseContext context){
+        return SUPPORTED_TYPES;
+    }
+
+    public void parse(InputStream stream, ContentHandler handler,
+                      Metadata metadata, ParseContext context) throws 
IOException,
+            SAXException, TikaException {
+
+        //Set MIME type as metadata
+        metadata.set(Metadata.CONTENT_TYPE, MATLAB_MIME_TYPE);
+
+        try {
+            System.err.println("I'm actually parsing, promise!");
+            //Extract information from header file
+            TikaInputStream tis = TikaInputStream.get(stream);
+            System.err.println("Before the MFR, after TIS.");
+            MatFileReader mfr = new MatFileReader(tis.getFile()); //input .mat 
file
+            System.err.println("Made it past the MFR!!");
+            MatFileHeader hdr = mfr.getMatFileHeader(); //.mat header 
information
+            System.err.println("Made it past the parsing into MFR and MFH!");
+
+            String stringToSplit = hdr.getDescription(); //break header 
information into its parts
+            String[] parts = stringToSplit.split(",");
+
+            // Ex .mat header "MATLAB 5.0 MAT-file, Platform: MACI64, Created 
on: Sun Mar  2 23:41:57 2014"
+            if (parts[2].contains("Created")) {
+                int lastIndex1 = parts[2].lastIndexOf("Created on:");
+                String dateCreated = parts[2].substring(lastIndex1 + 
11).trim();
+                metadata.set("createdOn", dateCreated);
+            }
+
+            if (parts[1].contains("Platform")) {
+                int lastIndex2 = parts[1].lastIndexOf("Platform:");
+                String platform = parts[1].substring(lastIndex2 + 9).trim();
+                metadata.set("platform" , platform);
+            }
+
+            if (parts[0].contains("MATLAB")) {
+                metadata.set("fileType", parts[0]);
+            }
+
+            //Get endian indicator from header file
+            String endianBytes = new String(hdr.getEndianIndicator()); 
//retrieve endian bytes and convert to string
+            String endianCode = String.valueOf(endianBytes.toCharArray()); 
//convert bytes to characters to string
+            metadata.set("endian", endianCode);
+
+            //Text output
+            XHTMLContentHandler xhtml = new XHTMLContentHandler(handler, 
metadata);
+            xhtml.startDocument();
+
+            //Get array names, size, and data types
+            Map<String, MLArray> data = mfr.getContent();
+            Set<String> vars = data.keySet();
+
+            //Loop through each variable
+            Iterator<String> var = vars.iterator();
+            while (var.hasNext()) {
+                String varName = var.next();
+                MLArray varData = data.get(varName);
+
+                xhtml.characters(varName);
+                xhtml.characters(":");
+                xhtml.characters(String.valueOf(varData));
+                xhtml.newline();
+
+                //if the variable is a structure, extract variable info from 
structure
+                if (varData.isStruct()){
+                    MLStructure mlStructure = (MLStructure) 
mfr.getMLArray(varName);
+                    Collection<MLArray> list = mlStructure.getAllFields();
+
+                    for (MLArray element : list){
+                        xhtml.characters("  ");
+                        xhtml.characters(String.valueOf(element));
+                        xhtml.newline();
+
+                        //if there is an imbedded structure, extract variable 
info.
+                        if (element.isStruct()){
+                            String nest = element.contentToString();
+                            xhtml.characters("      ");
+                            xhtml.characters(nest);
+                            xhtml.newline();
+                        }
+                    }
+                }
+            }
+
+            xhtml.endDocument();
+
+        } catch (IOException e) {
+            throw new TikaException("matparser error", e);
+        }
+
+    }
+
+}
\ No newline at end of file
Index: tika-parsers/src/test/java/org/apache/tika/parser/mat/MatParserTest.java
===================================================================
--- tika-parsers/src/test/java/org/apache/tika/parser/mat/MatParserTest.java    
(revision 0)
+++ tika-parsers/src/test/java/org/apache/tika/parser/mat/MatParserTest.java    
(working copy)
@@ -0,0 +1,71 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+//package org.apache.tika.parser.mat;
+package org.apache.tika.parser.mat;
+
+
+//JDK imports
+import static org.junit.Assert.assertEquals;
+import static org.junit.Assert.assertTrue;
+
+import java.io.InputStream;
+
+//TIKA imports
+import org.apache.tika.io.CloseShieldInputStream;
+import org.apache.tika.metadata.Metadata;
+import org.apache.tika.metadata.TikaCoreProperties;
+import org.apache.tika.parser.ParseContext;
+import org.apache.tika.parser.Parser;
+import org.apache.tika.sax.BodyContentHandler;
+import org.junit.Test;
+import org.xml.sax.ContentHandler;
+/**
+ * Test cases to exercise the {@link MatParser}.
+ *
+ */
+public class MatParserTest {
+
+    @Test
+    public void testParseGlobalMetadata() throws Exception {
+        if(System.getProperty("java.version").startsWith("1.5")) {
+            return;
+        }
+
+        Parser parser = new MatParser();
+        ContentHandler handler = new BodyContentHandler();
+        Metadata metadata = new Metadata();
+        InputStream stream = 
getClass().getResourceAsStream("/test-documents/MatlabFile.m");
+
+        try {
+            parser.parse(stream, handler, metadata, new ParseContext());
+        } finally {
+            stream.close();
+        }
+
+        /*assertEquals(metadata.get(Metadata.platform), "PCWIN64");
+        assertEquals(metadata.get(Metadata.PROJECT_ID),
+                "IPCC Fourth Assessment");
+
+                
+        String content = handler.toString();
+               assertTrue(content.contains(":long_name = \"Surface 
area\";"));*/
+
+
+
+    }
+
+}

Re: unit test error for new parser

Reply via email to