Author: nick
Date: Mon Jul 14 20:05:13 2014
New Revision: 1610506
URL: http://svn.apache.org/r1610506
Log:
Patch from Tyler Palsulich from TIKA-1327 - More enhancements to the Matlab
parser
Added:
tika/trunk/tika-parsers/src/test/resources/test-documents/test_mat_text.mat
(with props)
Modified:
tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/mat/MatParser.java
tika/trunk/tika-parsers/src/test/java/org/apache/tika/parser/mat/MatParserTest.java
Modified:
tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/mat/MatParser.java
URL:
http://svn.apache.org/viewvc/tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/mat/MatParser.java?rev=1610506&r1=1610505&r2=1610506&view=diff
==============================================================================
--- tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/mat/MatParser.java (original)
+++ tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/mat/MatParser.java Mon Jul 14 20:05:13 2014
@@ -48,101 +48,91 @@ import com.jmatio.common.MatDataTypes;
public class MatParser extends AbstractParser {
-
- public static final String MATLAB_MIME_TYPE =
- "application/x-matlab-data";
+
+ public static final String MATLAB_MIME_TYPE =
+ "application/x-matlab-data";
private final Set<MediaType> SUPPORTED_TYPES =
- Collections.singleton(MediaType.application("x-matlab-data"));
+ Collections.singleton(MediaType.application("x-matlab-data"));
public Set<MediaType> getSupportedTypes(ParseContext context){
return SUPPORTED_TYPES;
- }
+ }
+
+ public void parse(InputStream stream, ContentHandler handler, Metadata metadata, ParseContext context)
+ throws IOException, SAXException, TikaException {
- public void parse(InputStream stream, ContentHandler handler,
- Metadata metadata, ParseContext context) throws IOException,
- SAXException, TikaException {
-
- //Set MIME type as metadata
+ //Set MIME type as Matlab
metadata.set(Metadata.CONTENT_TYPE, MATLAB_MIME_TYPE);
-
+
try {
-
- //Input file stream
+ // Use TIS so we can spool a temp file for parsing.
TikaInputStream tis = TikaInputStream.get(stream);
-
+
//Extract information from header file
MatFileReader mfr = new MatFileReader(tis.getFile()); //input .mat file
MatFileHeader hdr = mfr.getMatFileHeader(); //.mat header information
- String stringToSplit = hdr.getDescription(); //break header information into its parts
- String[] parts = stringToSplit.split(",");
-
- // Ex .mat header "MATLAB 5.0 MAT-file, Platform: MACI64, Created on: Sun Mar 2 23:41:57 2014"
+ // Example header: "MATLAB 5.0 MAT-file, Platform: MACI64, Created on: Sun Mar 2 23:41:57 2014"
+ String[] parts = hdr.getDescription().split(","); // Break header information into its parts
+
if (parts[2].contains("Created")) {
int lastIndex1 = parts[2].lastIndexOf("Created on:");
- String dateCreated = parts[2].substring(lastIndex1 + 11).trim();
+ String dateCreated = parts[2].substring(lastIndex1 + "Created on:".length()).trim();
metadata.set("createdOn", dateCreated);
- }
-
+ }
+
if (parts[1].contains("Platform")) {
int lastIndex2 = parts[1].lastIndexOf("Platform:");
- String platform = parts[1].substring(lastIndex2 + 9).trim();
+ String platform = parts[1].substring(lastIndex2 + "Platform:".length()).trim();
metadata.set("platform" , platform);
- }
-
- if (parts[0].contains("MATLAB")) {
+ }
+
+ if (parts[0].contains("MATLAB")) {
metadata.set("fileType", parts[0]);
- }
-
- //Get endian indicator from header file
- String endianBytes = new String(hdr.getEndianIndicator()); //retrieve endian bytes and convert to string
- String endianCode = String.valueOf(endianBytes.toCharArray()); //convert bytes to characters to string
+ }
+
+ // Get endian indicator from header file
+ String endianBytes = new String(hdr.getEndianIndicator()); // Retrieve endian bytes and convert to string
+ String endianCode = String.valueOf(endianBytes.toCharArray()); // Convert bytes to characters to string
metadata.set("endian", endianCode);
-
+
//Text output
- XHTMLContentHandler xhtml = new XHTMLContentHandler(handler, metadata);
+ XHTMLContentHandler xhtml = new XHTMLContentHandler(handler, metadata);
xhtml.startDocument();
-
- //Get array names, size, and data types
- Map<String, MLArray> data = mfr.getContent();
- Set<String> vars = data.keySet();
-
- //Loop through each variable
- for (Iterator<String> var = vars.iterator(); var.hasNext();) {
- String varName = var.next();
- MLArray varData = data.get(varName);
-
- xhtml.characters(varName);
- xhtml.characters(":");
- xhtml.characters(String.valueOf(varData));
- xhtml.newline();
-
- //if the variable is a structure, extract variable info from structure
+ xhtml.newline();
+ //Loop through each variable
+ for (Map.Entry<String, MLArray> entry : mfr.getContent().entrySet()) {
+ String varName = entry.getKey();
+ MLArray varData = entry.getValue();
+
+ xhtml.element("p", varName + ":" + String.valueOf(varData));
+
+ // If the variable is a structure, extract variable info from structure
if (varData.isStruct()){
- MLStructure mlStructure = (MLStructure) mfr.getMLArray(varName);
- Collection<MLArray> list = mlStructure.getAllFields();
-
- for (MLArray element : list){
- xhtml.characters(" ");
+ MLStructure mlStructure = (MLStructure) mfr.getMLArray(varName);
+ xhtml.startElement("ul");
+ xhtml.newline();
+ for (MLArray element : mlStructure.getAllFields()){
+ xhtml.startElement("li");
xhtml.characters(String.valueOf(element));
- xhtml.newline();
-
- //if there is an imbedded structure, extract variable info.
- if (element.isStruct()){
- String nest = element.contentToString();
- xhtml.characters(" ");
- xhtml.characters(nest);
- xhtml.newline();
- }
+
+ // If there is an embedded structure, extract variable info.
+ if (element.isStruct()){
+ xhtml.startElement("ul");
+ // Should this actually be a recursive call?
+ xhtml.element("li", element.contentToString());
+ xhtml.endElement("ul");
}
- }
- }
- xhtml.endDocument();
-
- } catch (IOException e) {
- throw new TikaException("matparser error", e);
- }
- }
+ xhtml.endElement("li");
+ }
+ xhtml.endElement("ul");
+ }
+ }
+ xhtml.endDocument();
+ } catch (IOException e) {
+ throw new TikaException("Error parsing Matlab file with MatParser", e);
+ }
+ }
}
\ No newline at end of file
Modified:
tika/trunk/tika-parsers/src/test/java/org/apache/tika/parser/mat/MatParserTest.java
URL:
http://svn.apache.org/viewvc/tika/trunk/tika-parsers/src/test/java/org/apache/tika/parser/mat/MatParserTest.java?rev=1610506&r1=1610505&r2=1610506&view=diff
==============================================================================
--- tika/trunk/tika-parsers/src/test/java/org/apache/tika/parser/mat/MatParserTest.java (original)
+++ tika/trunk/tika-parsers/src/test/java/org/apache/tika/parser/mat/MatParserTest.java Mon Jul 14 20:05:13 2014
@@ -31,47 +31,70 @@ import org.apache.tika.metadata.TikaCore
import org.apache.tika.parser.ParseContext;
import org.apache.tika.parser.Parser;
import org.apache.tika.sax.BodyContentHandler;
+import org.apache.tika.sax.ToXMLContentHandler;
import org.junit.Test;
import org.xml.sax.ContentHandler;
/**
* Test cases to exercise the {@link MatParser}.
- *
+ *
*/
public class MatParserTest {
@Test
public void testParser() throws Exception {
-
+
Parser parser = new MatParser();
- ContentHandler handler = new BodyContentHandler();
+ ToXMLContentHandler handler = new ToXMLContentHandler();
Metadata metadata = new Metadata();
String path =
"/test-documents/breidamerkurjokull_radar_profiles_2009.mat";
- InputStream stream = MatParser.class.getResourceAsStream(path);
-
+ InputStream stream = MatParser.class.getResourceAsStream(path);
+
try {
parser.parse(stream, handler, metadata, new ParseContext());
- } finally {
- stream.close();
- }
-
- //Check Metadata
- assertEquals("PCWIN64", metadata.get("platform"));
- assertEquals("MATLAB 5.0 MAT-file", metadata.get("fileType"));
- assertEquals("IM", metadata.get("endian"));
- assertEquals("Thu Feb 21 15:52:49 2013", metadata.get("createdOn"));
-
- //Check Content
- String content = handler.toString();
-
- assertTrue(content.contains("a1:[1x1 struct array]"));
- assertTrue(content.contains("[1024x1261 double array]"));
- assertTrue(content.contains("a2:[1x1 struct array]"));
- assertTrue(content.contains("[1024x1283 double array]"));
- assertTrue(content.contains("b1:[1x1 struct array]"));
- assertTrue(content.contains("[1024x1311 double array]"));
- assertTrue(content.contains("c1:[1x1 struct array]"));
- assertTrue(content.contains("[1024x909 double array]"));
- }
+ } finally {
+ stream.close();
+ }
+
+ //Check Metadata
+ assertEquals("PCWIN64", metadata.get("platform"));
+ assertEquals("MATLAB 5.0 MAT-file", metadata.get("fileType"));
+ assertEquals("IM", metadata.get("endian"));
+ assertEquals("Thu Feb 21 15:52:49 2013", metadata.get("createdOn"));
+
+ //Check Content
+ String content = handler.toString();
+
+ assertTrue(content.contains("<li>[1x909 double array]</li>"));
+ assertTrue(content.contains("<p>c1:[1x1 struct array]</p>"));
+ assertTrue(content.contains("<li>[1024x1 double array]</li>"));
+ assertTrue(content.contains("<p>b1:[1x1 struct array]</p>"));
+ assertTrue(content.contains("<p>a1:[1x1 struct array]</p>"));
+ assertTrue(content.contains("<li>[1024x1261 double array]</li>"));
+ assertTrue(content.contains("<li>[1x1 double array]</li>"));
+ assertTrue(content.contains("</body></html>"));
+ }
+
+ @Test
+ public void testParserForText() throws Exception {
+
+ Parser parser = new MatParser();
+ ToXMLContentHandler handler = new ToXMLContentHandler();
+ Metadata metadata = new Metadata();
+ String path = "/test-documents/test_mat_text.mat";
+
+ InputStream stream = MatParser.class.getResourceAsStream(path);
+
+ try {
+ parser.parse(stream, handler, metadata, new ParseContext());
+ } finally {
+ stream.close();
+ }
+
+ //Check Content
+ String content = handler.toString();
+ assertTrue(content.contains("<p>double:[2x2 double array]</p>"));
+ System.err.println(content);
+ }
}
Added:
tika/trunk/tika-parsers/src/test/resources/test-documents/test_mat_text.mat
URL:
http://svn.apache.org/viewvc/tika/trunk/tika-parsers/src/test/resources/test-documents/test_mat_text.mat?rev=1610506&view=auto
==============================================================================
Binary file - no diff available.
Propchange:
tika/trunk/tika-parsers/src/test/resources/test-documents/test_mat_text.mat
------------------------------------------------------------------------------
svn:mime-type = application/octet-stream