Author: jukka
Date: Sat Dec 28 01:39:08 2013
New Revision: 1553779
URL: http://svn.apache.org/r1553779
Log:
TIKA-1160: Add support for SolidWorks files
Patch and test files by Gunter Rombauts
Added:
tika/trunk/tika-parsers/src/test/java/org/apache/tika/parser/solidworks/
tika/trunk/tika-parsers/src/test/java/org/apache/tika/parser/solidworks/SolidworksParserTest.java
tika/trunk/tika-parsers/src/test/resources/test-documents/testsolidworksAssembly2013SP2.SLDASM
tika/trunk/tika-parsers/src/test/resources/test-documents/testsolidworksAssembly2014SP0.SLDASM
tika/trunk/tika-parsers/src/test/resources/test-documents/testsolidworksDrawing2013SP2.SLDDRW
tika/trunk/tika-parsers/src/test/resources/test-documents/testsolidworksDrawing2014SP0.SLDDRW
tika/trunk/tika-parsers/src/test/resources/test-documents/testsolidworksPart2013SP2.SLDPRT
tika/trunk/tika-parsers/src/test/resources/test-documents/testsolidworksPart2014SP0.SLDPRT
Modified:
tika/trunk/tika-core/src/test/java/org/apache/tika/TikaDetectionTest.java
tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/OfficeParser.java
tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/POIFSContainerDetector.java
Modified:
tika/trunk/tika-core/src/test/java/org/apache/tika/TikaDetectionTest.java
URL:
http://svn.apache.org/viewvc/tika/trunk/tika-core/src/test/java/org/apache/tika/TikaDetectionTest.java?rev=1553779&r1=1553778&r2=1553779&view=diff
==============================================================================
--- tika/trunk/tika-core/src/test/java/org/apache/tika/TikaDetectionTest.java
(original)
+++ tika/trunk/tika-core/src/test/java/org/apache/tika/TikaDetectionTest.java
Sat Dec 28 01:39:08 2013
@@ -129,6 +129,9 @@ public class TikaDetectionTest {
assertEquals("application/sdp", tika.detect("x.sdp"));
assertEquals("application/set-payment-initiation",
tika.detect("x.setpay"));
assertEquals("application/set-registration-initiation",
tika.detect("x.setreg"));
+ assertEquals("application/sldworks", tika.detect("x.sldprt"));
+ assertEquals("application/sldworks", tika.detect("x.sldasm"));
+ assertEquals("application/sldworks", tika.detect("x.slddrw"));
assertEquals("application/shf+xml", tika.detect("x.shf"));
assertEquals("application/smil+xml", tika.detect("x.smi"));
assertEquals("application/smil+xml", tika.detect("x.smil"));
Modified:
tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/OfficeParser.java
URL:
http://svn.apache.org/viewvc/tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/OfficeParser.java?rev=1553779&r1=1553778&r2=1553779&view=diff
==============================================================================
---
tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/OfficeParser.java
(original)
+++
tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/OfficeParser.java
Sat Dec 28 01:39:08 2013
@@ -71,7 +71,10 @@ public class OfficeParser extends Abstra
POIFSDocumentType.VISIO.type,
// Works isn't supported
POIFSDocumentType.XLR.type, // but Works 7.0 Spreadsheet is
- POIFSDocumentType.OUTLOOK.type
+ POIFSDocumentType.OUTLOOK.type,
+ POIFSDocumentType.SOLIDWORKS_PART.type,
+ POIFSDocumentType.SOLIDWORKS_ASSEMBLY.type,
+ POIFSDocumentType.SOLIDWORKS_DRAWING.type
)));
public enum POIFSDocumentType {
@@ -87,7 +90,10 @@ public class OfficeParser extends Abstra
VISIO("vsd", MediaType.application("vnd.visio")),
WORKS("wps", MediaType.application("vnd.ms-works")),
XLR("xlr", MediaType.application("x-tika-msworks-spreadsheet")),
- OUTLOOK("msg", MediaType.application("vnd.ms-outlook"));
+ OUTLOOK("msg", MediaType.application("vnd.ms-outlook")),
+ SOLIDWORKS_PART("sldprt", MediaType.application("sldworks")),
+ SOLIDWORKS_ASSEMBLY("sldasm", MediaType.application("sldworks")),
+ SOLIDWORKS_DRAWING("slddrw", MediaType.application("sldworks"));
private final String extension;
private final MediaType type;
@@ -177,6 +183,13 @@ public class OfficeParser extends Abstra
}
switch (type) {
+ case SOLIDWORKS_PART:
+// TODO: SolidWorks extractor is not wired in yet: new SolidworksExtractor(context).parse(root, xhtml);
+ break;
+ case SOLIDWORKS_ASSEMBLY:
+ break;
+ case SOLIDWORKS_DRAWING:
+ break;
case PUBLISHER:
PublisherTextExtractor publisherTextExtractor =
new PublisherTextExtractor(root);
Modified:
tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/POIFSContainerDetector.java
URL:
http://svn.apache.org/viewvc/tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/POIFSContainerDetector.java?rev=1553779&r1=1553778&r2=1553779&view=diff
==============================================================================
---
tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/POIFSContainerDetector.java
(original)
+++
tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/POIFSContainerDetector.java
Sat Dec 28 01:39:08 2013
@@ -120,6 +120,9 @@ public class POIFSContainerDetector impl
/** StarOffice Writer */
public static final MediaType SDW = application("vnd.stardivision.writer");
+ /** SolidWorks CAD file */
+ public static final MediaType SLDWORKS = application("sldworks");
+
/** Regexp for matching the MPP Project Data stream */
private static final Pattern mppDataMatch =
Pattern.compile("\\s\\s\\s\\d+");
@@ -199,7 +202,9 @@ public class POIFSContainerDetector impl
*/
protected static MediaType detect(Set<String> names, DirectoryEntry root) {
if (names != null) {
- if (names.contains("StarCalcDocument")) {
+ if (names.contains("SwDocContentMgr") &&
names.contains("SwDocMgrTempStorage")) {
+ return SLDWORKS;
+ } else if (names.contains("StarCalcDocument")) {
// Star Office Calc
return SDC;
} else if (names.contains("StarWriterDocument")) {
Added:
tika/trunk/tika-parsers/src/test/java/org/apache/tika/parser/solidworks/SolidworksParserTest.java
URL:
http://svn.apache.org/viewvc/tika/trunk/tika-parsers/src/test/java/org/apache/tika/parser/solidworks/SolidworksParserTest.java?rev=1553779&view=auto
==============================================================================
---
tika/trunk/tika-parsers/src/test/java/org/apache/tika/parser/solidworks/SolidworksParserTest.java
(added)
+++
tika/trunk/tika-parsers/src/test/java/org/apache/tika/parser/solidworks/SolidworksParserTest.java
Sat Dec 28 01:39:08 2013
@@ -0,0 +1,213 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.tika.parser.solidworks;
+
+import static junit.framework.Assert.assertEquals;
+
+import java.io.InputStream;
+
+import org.apache.tika.TikaTest;
+import org.apache.tika.metadata.Metadata;
+import org.apache.tika.metadata.TikaCoreProperties;
+import org.apache.tika.parser.ParseContext;
+import org.apache.tika.parser.microsoft.OfficeParser;
+import org.apache.tika.sax.BodyContentHandler;
+import org.junit.Test;
+import org.xml.sax.ContentHandler;
+
+public class SolidworksParserTest extends TikaTest {
+
+ /**
+ * Test the parsing of a SolidWorks part in version 2013SP2
+ */
+ @Test
+ public void testPart2013SP2Parser() throws Exception {
+ InputStream input = SolidworksParserTest.class.getResourceAsStream(
+ "/test-documents/testsolidworksPart2013SP2.SLDPRT");
+ try {
+ ContentHandler handler = new BodyContentHandler();
+ Metadata metadata = new Metadata();
+ new OfficeParser().parse(input, handler, metadata, new
ParseContext());
+
+ //Check content type
+
assertEquals("application/sldworks",metadata.get(Metadata.CONTENT_TYPE));
+
+ //Check properties
+ assertEquals("2012-04-18T10:27:29Z",
metadata.get(TikaCoreProperties.CREATED));
+ assertEquals(null, metadata.get(TikaCoreProperties.CONTRIBUTOR));
+ assertEquals("2013-09-06T08:12:12Z",
metadata.get(Metadata.MODIFIED));
+ assertEquals("solidworks-dcom_dev",
metadata.get(TikaCoreProperties.MODIFIER));
+ assertEquals(null, metadata.get(TikaCoreProperties.RELATION));
+ assertEquals(null, metadata.get(TikaCoreProperties.RIGHTS));
+ assertEquals(null, metadata.get(TikaCoreProperties.SOURCE));
+ assertEquals("", metadata.get(TikaCoreProperties.TITLE));
+ assertEquals("", metadata.get(TikaCoreProperties.KEYWORDS));
+ } finally {
+ input.close();
+ }
+ }
+
+ /**
+ * Test the parsing of a SolidWorks part in version 2014SP0
+ */
+ @Test
+ public void testPart2014SP0Parser() throws Exception {
+ InputStream input = SolidworksParserTest.class.getResourceAsStream(
+ "/test-documents/testsolidworksPart2014SP0.SLDPRT");
+ try {
+ ContentHandler handler = new BodyContentHandler();
+ Metadata metadata = new Metadata();
+ new OfficeParser().parse(input, handler, metadata, new
ParseContext());
+
+ //Check content type
+
assertEquals("application/sldworks",metadata.get(Metadata.CONTENT_TYPE));
+
+ //Check properties
+ assertEquals("2012-04-18T10:27:29Z",
metadata.get(TikaCoreProperties.CREATED));
+ assertEquals(null, metadata.get(TikaCoreProperties.CONTRIBUTOR));
+ assertEquals("2013-11-28T12:38:28Z",
metadata.get(Metadata.MODIFIED));
+ assertEquals("solidworks-dcom_dev",
metadata.get(TikaCoreProperties.MODIFIER));
+ assertEquals(null, metadata.get(TikaCoreProperties.RELATION));
+ assertEquals(null, metadata.get(TikaCoreProperties.RIGHTS));
+ assertEquals(null, metadata.get(TikaCoreProperties.SOURCE));
+ assertEquals("", metadata.get(TikaCoreProperties.TITLE));
+ assertEquals("", metadata.get(TikaCoreProperties.KEYWORDS));
+ } finally {
+ input.close();
+ }
+ }
+
+ /**
+ * Test the parsing of a SolidWorks assembly in version 2013SP2
+ */
+ @Test
+ public void testAssembly2013SP2Parser() throws Exception {
+ InputStream input = SolidworksParserTest.class.getResourceAsStream(
+ "/test-documents/testsolidworksAssembly2013SP2.SLDASM");
+ try {
+ ContentHandler handler = new BodyContentHandler();
+ Metadata metadata = new Metadata();
+ new OfficeParser().parse(input, handler, metadata, new
ParseContext());
+
+ //Check content type
+
assertEquals("application/sldworks",metadata.get(Metadata.CONTENT_TYPE));
+
+ //Check properties
+ assertEquals("2012-04-25T09:51:38Z",
metadata.get(TikaCoreProperties.CREATED));
+ assertEquals(null, metadata.get(TikaCoreProperties.CONTRIBUTOR));
+ assertEquals("2013-09-06T08:11:08Z",
metadata.get(Metadata.MODIFIED));
+ assertEquals("solidworks-dcom_dev",
metadata.get(TikaCoreProperties.MODIFIER));
+ assertEquals(null, metadata.get(TikaCoreProperties.RELATION));
+ assertEquals(null, metadata.get(TikaCoreProperties.RIGHTS));
+ assertEquals(null, metadata.get(TikaCoreProperties.SOURCE));
+ assertEquals("", metadata.get(TikaCoreProperties.TITLE));
+ assertEquals("", metadata.get(TikaCoreProperties.KEYWORDS));
+ } finally {
+ input.close();
+ }
+ }
+
+ /**
+ * Test the parsing of a SolidWorks assembly in version 2014SP0
+ */
+ @Test
+ public void testAssembly2014SP0Parser() throws Exception {
+ InputStream input = SolidworksParserTest.class.getResourceAsStream(
+ "/test-documents/testsolidworksAssembly2014SP0.SLDASM");
+ try {
+ ContentHandler handler = new BodyContentHandler();
+ Metadata metadata = new Metadata();
+ new OfficeParser().parse(input, handler, metadata, new
ParseContext());
+
+ //Check content type
+
assertEquals("application/sldworks",metadata.get(Metadata.CONTENT_TYPE));
+
+ //Check properties
+ assertEquals("2012-04-25T09:51:38Z",
metadata.get(TikaCoreProperties.CREATED));
+ assertEquals(null, metadata.get(TikaCoreProperties.CONTRIBUTOR));
+ assertEquals("2013-11-28T12:41:49Z",
metadata.get(Metadata.MODIFIED));
+ assertEquals("solidworks-dcom_dev",
metadata.get(TikaCoreProperties.MODIFIER));
+ assertEquals(null, metadata.get(TikaCoreProperties.RELATION));
+ assertEquals(null, metadata.get(TikaCoreProperties.RIGHTS));
+ assertEquals(null, metadata.get(TikaCoreProperties.SOURCE));
+ assertEquals("", metadata.get(TikaCoreProperties.TITLE));
+ assertEquals("", metadata.get(TikaCoreProperties.KEYWORDS));
+ } finally {
+ input.close();
+ }
+ }
+
+ /**
+ * Test the parsing of a SolidWorks drawing in version 2013SP2
+ */
+ @Test
+ public void testDrawing2013SP2Parser() throws Exception {
+ InputStream input = SolidworksParserTest.class.getResourceAsStream(
+ "/test-documents/testsolidworksDrawing2013SP2.SLDDRW");
+ try {
+ ContentHandler handler = new BodyContentHandler();
+ Metadata metadata = new Metadata();
+ new OfficeParser().parse(input, handler, metadata, new
ParseContext());
+
+ //Check content type
+
assertEquals("application/sldworks",metadata.get(Metadata.CONTENT_TYPE));
+
+ //Check properties
+ assertEquals("2012-07-03T12:05:29Z",
metadata.get(TikaCoreProperties.CREATED));
+ assertEquals(null, metadata.get(TikaCoreProperties.CONTRIBUTOR));
+ assertEquals("2013-09-06T08:06:57Z",
metadata.get(Metadata.MODIFIED));
+ assertEquals("solidworks-dcom_dev",
metadata.get(TikaCoreProperties.MODIFIER));
+ assertEquals(null, metadata.get(TikaCoreProperties.RELATION));
+ assertEquals(null, metadata.get(TikaCoreProperties.RIGHTS));
+ assertEquals(null, metadata.get(TikaCoreProperties.SOURCE));
+ assertEquals("", metadata.get(TikaCoreProperties.TITLE));
+ assertEquals("", metadata.get(TikaCoreProperties.KEYWORDS));
+ } finally {
+ input.close();
+ }
+ }
+
+ /**
+ * Test the parsing of a SolidWorks drawing in version 2014SP0
+ */
+ @Test
+ public void testDrawing2014SP0Parser() throws Exception {
+ InputStream input = SolidworksParserTest.class.getResourceAsStream(
+ "/test-documents/testsolidworksDrawing2014SP0.SLDDRW");
+ try {
+ ContentHandler handler = new BodyContentHandler();
+ Metadata metadata = new Metadata();
+ new OfficeParser().parse(input, handler, metadata, new
ParseContext());
+
+ //Check content type
+
assertEquals("application/sldworks",metadata.get(Metadata.CONTENT_TYPE));
+
+ //Check properties
+ assertEquals("2012-07-03T12:05:29Z",
metadata.get(TikaCoreProperties.CREATED));
+ assertEquals(null, metadata.get(TikaCoreProperties.CONTRIBUTOR));
+ assertEquals("2013-11-28T12:41:49Z",
metadata.get(Metadata.MODIFIED));
+ assertEquals("solidworks-dcom_dev",
metadata.get(TikaCoreProperties.MODIFIER));
+ assertEquals(null, metadata.get(TikaCoreProperties.RELATION));
+ assertEquals(null, metadata.get(TikaCoreProperties.RIGHTS));
+ assertEquals(null, metadata.get(TikaCoreProperties.SOURCE));
+ assertEquals("", metadata.get(TikaCoreProperties.TITLE));
+ assertEquals("", metadata.get(TikaCoreProperties.KEYWORDS));
+ } finally {
+ input.close();
+ }
+ }
+}
Added:
tika/trunk/tika-parsers/src/test/resources/test-documents/testsolidworksAssembly2013SP2.SLDASM
URL:
http://svn.apache.org/viewvc/tika/trunk/tika-parsers/src/test/resources/test-documents/testsolidworksAssembly2013SP2.SLDASM?rev=1553779&view=auto
==============================================================================
Files
tika/trunk/tika-parsers/src/test/resources/test-documents/testsolidworksAssembly2013SP2.SLDASM
(added) and
tika/trunk/tika-parsers/src/test/resources/test-documents/testsolidworksAssembly2013SP2.SLDASM
Sat Dec 28 01:39:08 2013 differ
Added:
tika/trunk/tika-parsers/src/test/resources/test-documents/testsolidworksAssembly2014SP0.SLDASM
URL:
http://svn.apache.org/viewvc/tika/trunk/tika-parsers/src/test/resources/test-documents/testsolidworksAssembly2014SP0.SLDASM?rev=1553779&view=auto
==============================================================================
Files
tika/trunk/tika-parsers/src/test/resources/test-documents/testsolidworksAssembly2014SP0.SLDASM
(added) and
tika/trunk/tika-parsers/src/test/resources/test-documents/testsolidworksAssembly2014SP0.SLDASM
Sat Dec 28 01:39:08 2013 differ
Added:
tika/trunk/tika-parsers/src/test/resources/test-documents/testsolidworksDrawing2013SP2.SLDDRW
URL:
http://svn.apache.org/viewvc/tika/trunk/tika-parsers/src/test/resources/test-documents/testsolidworksDrawing2013SP2.SLDDRW?rev=1553779&view=auto
==============================================================================
Files
tika/trunk/tika-parsers/src/test/resources/test-documents/testsolidworksDrawing2013SP2.SLDDRW
(added) and
tika/trunk/tika-parsers/src/test/resources/test-documents/testsolidworksDrawing2013SP2.SLDDRW
Sat Dec 28 01:39:08 2013 differ
Added:
tika/trunk/tika-parsers/src/test/resources/test-documents/testsolidworksDrawing2014SP0.SLDDRW
URL:
http://svn.apache.org/viewvc/tika/trunk/tika-parsers/src/test/resources/test-documents/testsolidworksDrawing2014SP0.SLDDRW?rev=1553779&view=auto
==============================================================================
Files
tika/trunk/tika-parsers/src/test/resources/test-documents/testsolidworksDrawing2014SP0.SLDDRW
(added) and
tika/trunk/tika-parsers/src/test/resources/test-documents/testsolidworksDrawing2014SP0.SLDDRW
Sat Dec 28 01:39:08 2013 differ
Added:
tika/trunk/tika-parsers/src/test/resources/test-documents/testsolidworksPart2013SP2.SLDPRT
URL:
http://svn.apache.org/viewvc/tika/trunk/tika-parsers/src/test/resources/test-documents/testsolidworksPart2013SP2.SLDPRT?rev=1553779&view=auto
==============================================================================
Files
tika/trunk/tika-parsers/src/test/resources/test-documents/testsolidworksPart2013SP2.SLDPRT
(added) and
tika/trunk/tika-parsers/src/test/resources/test-documents/testsolidworksPart2013SP2.SLDPRT
Sat Dec 28 01:39:08 2013 differ
Added:
tika/trunk/tika-parsers/src/test/resources/test-documents/testsolidworksPart2014SP0.SLDPRT
URL:
http://svn.apache.org/viewvc/tika/trunk/tika-parsers/src/test/resources/test-documents/testsolidworksPart2014SP0.SLDPRT?rev=1553779&view=auto
==============================================================================
Files
tika/trunk/tika-parsers/src/test/resources/test-documents/testsolidworksPart2014SP0.SLDPRT
(added) and
tika/trunk/tika-parsers/src/test/resources/test-documents/testsolidworksPart2014SP0.SLDPRT
Sat Dec 28 01:39:08 2013 differ