Author: jukka
Date: Sat Dec 28 01:39:08 2013
New Revision: 1553779

URL: http://svn.apache.org/r1553779
Log:
TIKA-1160: Add support for SolidWorks files

Patch and test files by Gunter Rombauts

Added:
    tika/trunk/tika-parsers/src/test/java/org/apache/tika/parser/solidworks/
    tika/trunk/tika-parsers/src/test/java/org/apache/tika/parser/solidworks/SolidworksParserTest.java
    tika/trunk/tika-parsers/src/test/resources/test-documents/testsolidworksAssembly2013SP2.SLDASM
    tika/trunk/tika-parsers/src/test/resources/test-documents/testsolidworksAssembly2014SP0.SLDASM
    tika/trunk/tika-parsers/src/test/resources/test-documents/testsolidworksDrawing2013SP2.SLDDRW
    tika/trunk/tika-parsers/src/test/resources/test-documents/testsolidworksDrawing2014SP0.SLDDRW
    tika/trunk/tika-parsers/src/test/resources/test-documents/testsolidworksPart2013SP2.SLDPRT
    tika/trunk/tika-parsers/src/test/resources/test-documents/testsolidworksPart2014SP0.SLDPRT
Modified:
    tika/trunk/tika-core/src/test/java/org/apache/tika/TikaDetectionTest.java
    tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/OfficeParser.java
    tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/POIFSContainerDetector.java

Modified: 
tika/trunk/tika-core/src/test/java/org/apache/tika/TikaDetectionTest.java
URL: 
http://svn.apache.org/viewvc/tika/trunk/tika-core/src/test/java/org/apache/tika/TikaDetectionTest.java?rev=1553779&r1=1553778&r2=1553779&view=diff
==============================================================================
--- tika/trunk/tika-core/src/test/java/org/apache/tika/TikaDetectionTest.java 
(original)
+++ tika/trunk/tika-core/src/test/java/org/apache/tika/TikaDetectionTest.java 
Sat Dec 28 01:39:08 2013
@@ -129,6 +129,9 @@ public class TikaDetectionTest {
         assertEquals("application/sdp", tika.detect("x.sdp"));
         assertEquals("application/set-payment-initiation", 
tika.detect("x.setpay"));
         assertEquals("application/set-registration-initiation", 
tika.detect("x.setreg"));
+        assertEquals("application/sldworks", tika.detect("x.sldprt"));
+        assertEquals("application/sldworks", tika.detect("x.sldasm"));
+        assertEquals("application/sldworks", tika.detect("x.slddrw"));
         assertEquals("application/shf+xml", tika.detect("x.shf"));
         assertEquals("application/smil+xml", tika.detect("x.smi"));
         assertEquals("application/smil+xml", tika.detect("x.smil"));

Modified: 
tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/OfficeParser.java
URL: 
http://svn.apache.org/viewvc/tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/OfficeParser.java?rev=1553779&r1=1553778&r2=1553779&view=diff
==============================================================================
--- 
tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/OfficeParser.java
 (original)
+++ 
tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/OfficeParser.java
 Sat Dec 28 01:39:08 2013
@@ -71,7 +71,10 @@ public class OfficeParser extends Abstra
                     POIFSDocumentType.VISIO.type,
                     // Works isn't supported
                     POIFSDocumentType.XLR.type, // but Works 7.0 Spreadsheet is
-                    POIFSDocumentType.OUTLOOK.type
+                    POIFSDocumentType.OUTLOOK.type,
+                    POIFSDocumentType.SOLIDWORKS_PART.type,
+                    POIFSDocumentType.SOLIDWORKS_ASSEMBLY.type,
+                    POIFSDocumentType.SOLIDWORKS_DRAWING.type
                     )));
 
     public enum POIFSDocumentType {
@@ -87,7 +90,10 @@ public class OfficeParser extends Abstra
         VISIO("vsd", MediaType.application("vnd.visio")),
         WORKS("wps", MediaType.application("vnd.ms-works")),
         XLR("xlr", MediaType.application("x-tika-msworks-spreadsheet")),
-        OUTLOOK("msg", MediaType.application("vnd.ms-outlook"));
+        OUTLOOK("msg", MediaType.application("vnd.ms-outlook")),
+        SOLIDWORKS_PART("sldprt", MediaType.application("sldworks")),
+        SOLIDWORKS_ASSEMBLY("sldasm", MediaType.application("sldworks")),
+        SOLIDWORKS_DRAWING("slddrw", MediaType.application("sldworks"));
 
         private final String extension;
         private final MediaType type;
@@ -177,6 +183,13 @@ public class OfficeParser extends Abstra
         }
 
         switch (type) {
+        case SOLIDWORKS_PART:
+//             new SolidworksExtractor(context).parse(root, xhtml);
+               break;
+        case SOLIDWORKS_ASSEMBLY:
+               break;
+        case SOLIDWORKS_DRAWING:
+               break;
         case PUBLISHER:
            PublisherTextExtractor publisherTextExtractor =
               new PublisherTextExtractor(root);

Modified: 
tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/POIFSContainerDetector.java
URL: 
http://svn.apache.org/viewvc/tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/POIFSContainerDetector.java?rev=1553779&r1=1553778&r2=1553779&view=diff
==============================================================================
--- 
tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/POIFSContainerDetector.java
 (original)
+++ 
tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/POIFSContainerDetector.java
 Sat Dec 28 01:39:08 2013
@@ -120,6 +120,9 @@ public class POIFSContainerDetector impl
     /** StarOffice Writer */
     public static final MediaType SDW = application("vnd.stardivision.writer");
 
+    /** SolidWorks CAD file */
+    public static final MediaType SLDWORKS = application("sldworks");
+
     /** Regexp for matching the MPP Project Data stream */
     private static final Pattern mppDataMatch = 
Pattern.compile("\\s\\s\\s\\d+");
 
@@ -199,7 +202,9 @@ public class POIFSContainerDetector impl
      */
     protected static MediaType detect(Set<String> names, DirectoryEntry root) {
         if (names != null) {
-            if (names.contains("StarCalcDocument")) {
+            if (names.contains("SwDocContentMgr") && 
names.contains("SwDocMgrTempStorage")) {
+                return SLDWORKS;
+            } else if (names.contains("StarCalcDocument")) {
                 // Star Office Calc
                 return SDC;
             } else if (names.contains("StarWriterDocument")) {

Added: 
tika/trunk/tika-parsers/src/test/java/org/apache/tika/parser/solidworks/SolidworksParserTest.java
URL: 
http://svn.apache.org/viewvc/tika/trunk/tika-parsers/src/test/java/org/apache/tika/parser/solidworks/SolidworksParserTest.java?rev=1553779&view=auto
==============================================================================
--- 
tika/trunk/tika-parsers/src/test/java/org/apache/tika/parser/solidworks/SolidworksParserTest.java
 (added)
+++ 
tika/trunk/tika-parsers/src/test/java/org/apache/tika/parser/solidworks/SolidworksParserTest.java
 Sat Dec 28 01:39:08 2013
@@ -0,0 +1,213 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.tika.parser.solidworks;
+
+import static junit.framework.Assert.assertEquals;
+
+import java.io.InputStream;
+
+import org.apache.tika.TikaTest;
+import org.apache.tika.metadata.Metadata;
+import org.apache.tika.metadata.TikaCoreProperties;
+import org.apache.tika.parser.ParseContext;
+import org.apache.tika.parser.microsoft.OfficeParser;
+import org.apache.tika.sax.BodyContentHandler;
+import org.junit.Test;
+import org.xml.sax.ContentHandler;
+
+public class SolidworksParserTest extends TikaTest {
+
+    /**
+     * Test the parsing of a SolidWorks part in version 2013SP2
+     */
+    @Test
+    public void testPart2013SP2Parser() throws Exception {
+        InputStream input = SolidworksParserTest.class.getResourceAsStream(
+                "/test-documents/testsolidworksPart2013SP2.SLDPRT");
+        try {
+            ContentHandler handler = new BodyContentHandler();
+            Metadata metadata = new Metadata();
+            new OfficeParser().parse(input, handler, metadata, new 
ParseContext());
+
+            //Check content type
+            
assertEquals("application/sldworks",metadata.get(Metadata.CONTENT_TYPE));
+             
+            //Check properties
+            assertEquals("2012-04-18T10:27:29Z", 
metadata.get(TikaCoreProperties.CREATED));
+            assertEquals(null, metadata.get(TikaCoreProperties.CONTRIBUTOR));
+            assertEquals("2013-09-06T08:12:12Z", 
metadata.get(Metadata.MODIFIED));
+            assertEquals("solidworks-dcom_dev", 
metadata.get(TikaCoreProperties.MODIFIER));
+            assertEquals(null, metadata.get(TikaCoreProperties.RELATION));
+            assertEquals(null, metadata.get(TikaCoreProperties.RIGHTS));
+            assertEquals(null, metadata.get(TikaCoreProperties.SOURCE));
+            assertEquals("", metadata.get(TikaCoreProperties.TITLE));
+            assertEquals("", metadata.get(TikaCoreProperties.KEYWORDS));
+        } finally {
+            input.close();
+        }
+    }
+
+    /**
+     * Test the parsing of a SolidWorks part in version 2014SP0
+     */
+    @Test
+    public void testPart2014SP0Parser() throws Exception {
+        InputStream input = SolidworksParserTest.class.getResourceAsStream(
+                "/test-documents/testsolidworksPart2014SP0.SLDPRT");
+        try {
+            ContentHandler handler = new BodyContentHandler();
+            Metadata metadata = new Metadata();
+            new OfficeParser().parse(input, handler, metadata, new 
ParseContext());
+
+            //Check content type
+            
assertEquals("application/sldworks",metadata.get(Metadata.CONTENT_TYPE));
+            
+            //Check properties
+            assertEquals("2012-04-18T10:27:29Z", 
metadata.get(TikaCoreProperties.CREATED));
+            assertEquals(null, metadata.get(TikaCoreProperties.CONTRIBUTOR));
+            assertEquals("2013-11-28T12:38:28Z", 
metadata.get(Metadata.MODIFIED));
+            assertEquals("solidworks-dcom_dev", 
metadata.get(TikaCoreProperties.MODIFIER));
+            assertEquals(null, metadata.get(TikaCoreProperties.RELATION));
+            assertEquals(null, metadata.get(TikaCoreProperties.RIGHTS));
+            assertEquals(null, metadata.get(TikaCoreProperties.SOURCE));
+            assertEquals("", metadata.get(TikaCoreProperties.TITLE));
+            assertEquals("", metadata.get(TikaCoreProperties.KEYWORDS));
+        } finally {
+            input.close();
+        }
+    }
+
+    /**
+     * Test the parsing of a SolidWorks assembly in version 2013SP2
+     */
+    @Test
+    public void testAssembly2013SP2Parser() throws Exception {
+        InputStream input = SolidworksParserTest.class.getResourceAsStream(
+                "/test-documents/testsolidworksAssembly2013SP2.SLDASM");
+        try {
+            ContentHandler handler = new BodyContentHandler();
+            Metadata metadata = new Metadata();
+            new OfficeParser().parse(input, handler, metadata, new 
ParseContext());
+
+            //Check content type
+            
assertEquals("application/sldworks",metadata.get(Metadata.CONTENT_TYPE));
+            
+            //Check properties
+            assertEquals("2012-04-25T09:51:38Z", 
metadata.get(TikaCoreProperties.CREATED));
+            assertEquals(null, metadata.get(TikaCoreProperties.CONTRIBUTOR));
+            assertEquals("2013-09-06T08:11:08Z", 
metadata.get(Metadata.MODIFIED));
+            assertEquals("solidworks-dcom_dev", 
metadata.get(TikaCoreProperties.MODIFIER));
+            assertEquals(null, metadata.get(TikaCoreProperties.RELATION));
+            assertEquals(null, metadata.get(TikaCoreProperties.RIGHTS));
+            assertEquals(null, metadata.get(TikaCoreProperties.SOURCE));
+            assertEquals("", metadata.get(TikaCoreProperties.TITLE));
+            assertEquals("", metadata.get(TikaCoreProperties.KEYWORDS));
+        } finally {
+            input.close();
+        }      
+    }
+
+    /**
+     * Test the parsing of a SolidWorks assembly in version 2014SP0
+     */
+    @Test
+    public void testAssembly2014SP0Parser() throws Exception {
+        InputStream input = SolidworksParserTest.class.getResourceAsStream(
+                "/test-documents/testsolidworksAssembly2014SP0.SLDASM");
+        try {
+            ContentHandler handler = new BodyContentHandler();
+            Metadata metadata = new Metadata();
+            new OfficeParser().parse(input, handler, metadata, new 
ParseContext());
+
+            //Check content type
+            
assertEquals("application/sldworks",metadata.get(Metadata.CONTENT_TYPE));
+            
+            //Check properties
+            assertEquals("2012-04-25T09:51:38Z", 
metadata.get(TikaCoreProperties.CREATED));
+            assertEquals(null, metadata.get(TikaCoreProperties.CONTRIBUTOR));
+            assertEquals("2013-11-28T12:41:49Z", 
metadata.get(Metadata.MODIFIED));
+            assertEquals("solidworks-dcom_dev", 
metadata.get(TikaCoreProperties.MODIFIER));
+            assertEquals(null, metadata.get(TikaCoreProperties.RELATION));
+            assertEquals(null, metadata.get(TikaCoreProperties.RIGHTS));
+            assertEquals(null, metadata.get(TikaCoreProperties.SOURCE));
+            assertEquals("", metadata.get(TikaCoreProperties.TITLE));
+            assertEquals("", metadata.get(TikaCoreProperties.KEYWORDS));
+        } finally {
+            input.close();
+        }      
+    }
+
+    /**
+     * Test the parsing of a SolidWorks drawing in version 2013SP2
+     */
+    @Test
+    public void testDrawing2013SP2Parser() throws Exception {
+        InputStream input = SolidworksParserTest.class.getResourceAsStream(
+                "/test-documents/testsolidworksDrawing2013SP2.SLDDRW");
+        try {
+            ContentHandler handler = new BodyContentHandler();
+            Metadata metadata = new Metadata();
+            new OfficeParser().parse(input, handler, metadata, new 
ParseContext());
+
+            //Check content type
+            
assertEquals("application/sldworks",metadata.get(Metadata.CONTENT_TYPE));
+            
+            //Check properties
+            assertEquals("2012-07-03T12:05:29Z", 
metadata.get(TikaCoreProperties.CREATED));
+            assertEquals(null, metadata.get(TikaCoreProperties.CONTRIBUTOR));
+            assertEquals("2013-09-06T08:06:57Z", 
metadata.get(Metadata.MODIFIED));
+            assertEquals("solidworks-dcom_dev", 
metadata.get(TikaCoreProperties.MODIFIER));
+            assertEquals(null, metadata.get(TikaCoreProperties.RELATION));
+            assertEquals(null, metadata.get(TikaCoreProperties.RIGHTS));
+            assertEquals(null, metadata.get(TikaCoreProperties.SOURCE));
+            assertEquals("", metadata.get(TikaCoreProperties.TITLE));
+            assertEquals("", metadata.get(TikaCoreProperties.KEYWORDS));
+        } finally {
+            input.close();
+        }      
+    }
+
+    /**
+     * Test the parsing of a SolidWorks drawing in version 2014SP0
+     */
+    @Test
+    public void testDrawing2014SP0Parser() throws Exception {
+        InputStream input = SolidworksParserTest.class.getResourceAsStream(
+                "/test-documents/testsolidworksDrawing2014SP0.SLDDRW");
+        try {
+            ContentHandler handler = new BodyContentHandler();
+            Metadata metadata = new Metadata();
+            new OfficeParser().parse(input, handler, metadata, new 
ParseContext());
+
+            //Check content type
+            
assertEquals("application/sldworks",metadata.get(Metadata.CONTENT_TYPE));
+            
+            //Check properties
+            assertEquals("2012-07-03T12:05:29Z", 
metadata.get(TikaCoreProperties.CREATED));
+            assertEquals(null, metadata.get(TikaCoreProperties.CONTRIBUTOR));
+            assertEquals("2013-11-28T12:41:49Z", 
metadata.get(Metadata.MODIFIED));
+            assertEquals("solidworks-dcom_dev", 
metadata.get(TikaCoreProperties.MODIFIER));
+            assertEquals(null, metadata.get(TikaCoreProperties.RELATION));
+            assertEquals(null, metadata.get(TikaCoreProperties.RIGHTS));
+            assertEquals(null, metadata.get(TikaCoreProperties.SOURCE));
+            assertEquals("", metadata.get(TikaCoreProperties.TITLE));
+            assertEquals("", metadata.get(TikaCoreProperties.KEYWORDS));
+        } finally {
+            input.close();
+        }      
+    }
+}

Added: 
tika/trunk/tika-parsers/src/test/resources/test-documents/testsolidworksAssembly2013SP2.SLDASM
URL: 
http://svn.apache.org/viewvc/tika/trunk/tika-parsers/src/test/resources/test-documents/testsolidworksAssembly2013SP2.SLDASM?rev=1553779&view=auto
==============================================================================
Files 
tika/trunk/tika-parsers/src/test/resources/test-documents/testsolidworksAssembly2013SP2.SLDASM
 (added) and 
tika/trunk/tika-parsers/src/test/resources/test-documents/testsolidworksAssembly2013SP2.SLDASM
 Sat Dec 28 01:39:08 2013 differ

Added: 
tika/trunk/tika-parsers/src/test/resources/test-documents/testsolidworksAssembly2014SP0.SLDASM
URL: 
http://svn.apache.org/viewvc/tika/trunk/tika-parsers/src/test/resources/test-documents/testsolidworksAssembly2014SP0.SLDASM?rev=1553779&view=auto
==============================================================================
Files 
tika/trunk/tika-parsers/src/test/resources/test-documents/testsolidworksAssembly2014SP0.SLDASM
 (added) and 
tika/trunk/tika-parsers/src/test/resources/test-documents/testsolidworksAssembly2014SP0.SLDASM
 Sat Dec 28 01:39:08 2013 differ

Added: 
tika/trunk/tika-parsers/src/test/resources/test-documents/testsolidworksDrawing2013SP2.SLDDRW
URL: 
http://svn.apache.org/viewvc/tika/trunk/tika-parsers/src/test/resources/test-documents/testsolidworksDrawing2013SP2.SLDDRW?rev=1553779&view=auto
==============================================================================
Files 
tika/trunk/tika-parsers/src/test/resources/test-documents/testsolidworksDrawing2013SP2.SLDDRW
 (added) and 
tika/trunk/tika-parsers/src/test/resources/test-documents/testsolidworksDrawing2013SP2.SLDDRW
 Sat Dec 28 01:39:08 2013 differ

Added: 
tika/trunk/tika-parsers/src/test/resources/test-documents/testsolidworksDrawing2014SP0.SLDDRW
URL: 
http://svn.apache.org/viewvc/tika/trunk/tika-parsers/src/test/resources/test-documents/testsolidworksDrawing2014SP0.SLDDRW?rev=1553779&view=auto
==============================================================================
Files 
tika/trunk/tika-parsers/src/test/resources/test-documents/testsolidworksDrawing2014SP0.SLDDRW
 (added) and 
tika/trunk/tika-parsers/src/test/resources/test-documents/testsolidworksDrawing2014SP0.SLDDRW
 Sat Dec 28 01:39:08 2013 differ

Added: 
tika/trunk/tika-parsers/src/test/resources/test-documents/testsolidworksPart2013SP2.SLDPRT
URL: 
http://svn.apache.org/viewvc/tika/trunk/tika-parsers/src/test/resources/test-documents/testsolidworksPart2013SP2.SLDPRT?rev=1553779&view=auto
==============================================================================
Files 
tika/trunk/tika-parsers/src/test/resources/test-documents/testsolidworksPart2013SP2.SLDPRT
 (added) and 
tika/trunk/tika-parsers/src/test/resources/test-documents/testsolidworksPart2013SP2.SLDPRT
 Sat Dec 28 01:39:08 2013 differ

Added: 
tika/trunk/tika-parsers/src/test/resources/test-documents/testsolidworksPart2014SP0.SLDPRT
URL: 
http://svn.apache.org/viewvc/tika/trunk/tika-parsers/src/test/resources/test-documents/testsolidworksPart2014SP0.SLDPRT?rev=1553779&view=auto
==============================================================================
Files 
tika/trunk/tika-parsers/src/test/resources/test-documents/testsolidworksPart2014SP0.SLDPRT
 (added) and 
tika/trunk/tika-parsers/src/test/resources/test-documents/testsolidworksPart2014SP0.SLDPRT
 Sat Dec 28 01:39:08 2013 differ


Reply via email to