This is an automated email from the ASF dual-hosted git repository.
tallison pushed a commit to branch main
in repository https://gitbox.apache.org/repos/asf/tika.git
The following commit(s) were added to refs/heads/main by this push:
new 75d88149b TIKA-1735 - Adding DWGRead parser to Tika if available (#558)
75d88149b is described below
commit 75d88149bc479d79b02c986e86333be11f697692
Author: Dan Coldrick <[email protected]>
AuthorDate: Thu Oct 6 16:38:50 2022 +0100
TIKA-1735 - Adding DWGRead parser to Tika if available (#558)
* Initial commit for review.
* Resolving tmpFileOutCleaned to make sure it is deleted even if we
encounter an exception
* Added ProcessUtils.execute instead of calling process directly and fixed
checkstyle
* Added Summary Info from Nicholas DiPiazza
* Added tests back in, added initialize for dwgread to check exists
* Attempt at cleaning up the DWG strings with regex
* Fixed some of the broken regexes
* Cleaned up code for checkstyle
* Fixed NaN handling, which was replacing it with "" instead of 0
* Added buffer reader and added new regexes
* Amended Regexes, fixed config default, fixed Julian Date
* Update DWGParser.java
* Fixed DWGParser
* Fixed DWGParser Test
* Fixed Configs
* Added new class for cleaning up format
* Check Style fixes
* Added Tests and fixed DWGReadFormatRemover
* Added Tests and fixed DWGReadFormatRemover
Check Style fixes
* Fixed CheckStyle Issues
* Added Timeout test
* Added Timeout test
Co-authored-by: monkm <[email protected]>
---
.../tika-parser-cad-module/pom.xml | 11 +
.../apache/tika/parser/dwg/AbstractDWGParser.java | 93 ++++++
.../java/org/apache/tika/parser/dwg/DWGParser.java | 151 +++++-----
.../apache/tika/parser/dwg/DWGParserConfig.java | 128 ++++++++
.../tika/parser/dwg/DWGReadFormatRemover.java | 106 +++++++
.../org/apache/tika/parser/dwg/DWGReadParser.java | 325 +++++++++++++++++++++
.../org/apache/tika/parser/dwg/JulianDateUtil.java | 47 +++
.../org/apache/tika/parser/dwg/DWGParserTest.java | 67 ++++-
.../tika/parser/dwg/DWGReadFormatRemoverTest.java | 73 +++++
.../test-configs/tika-config-dwgRead-Timeout.xml | 27 ++
.../resources/test-configs/tika-config-dwgRead.xml | 26 ++
...tural_-_annotation_scaling_and_multileaders.dwg | Bin 0 -> 188992 bytes
12 files changed, 971 insertions(+), 83 deletions(-)
diff --git
a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-cad-module/pom.xml
b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-cad-module/pom.xml
index 9c7ee4937..c08ebeeee 100644
---
a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-cad-module/pom.xml
+++
b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-cad-module/pom.xml
@@ -36,6 +36,17 @@
<artifactId>tika-parser-microsoft-module</artifactId>
<version>${project.version}</version>
</dependency>
+
+ <dependency>
+ <groupId>com.fasterxml.jackson.core</groupId>
+ <artifactId>jackson-core</artifactId>
+ <version>${jackson.version}</version><!--$NO-MVN-MAN-VER$-->
+ </dependency>
+ <dependency>
+ <groupId>com.fasterxml.jackson.core</groupId>
+ <artifactId>jackson-databind</artifactId>
+ <version>${jackson.version}</version><!--$NO-MVN-MAN-VER$-->
+ </dependency>
</dependencies>
<build>
<plugins>
diff --git
a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-cad-module/src/main/java/org/apache/tika/parser/dwg/AbstractDWGParser.java
b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-cad-module/src/main/java/org/apache/tika/parser/dwg/AbstractDWGParser.java
new file mode 100644
index 000000000..934ec5cba
--- /dev/null
+++
b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-cad-module/src/main/java/org/apache/tika/parser/dwg/AbstractDWGParser.java
@@ -0,0 +1,93 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.tika.parser.dwg;
+
+
+import org.apache.tika.config.Field;
+import org.apache.tika.parser.AbstractParser;
+import org.apache.tika.parser.ParseContext;
+
+
+
+
+/**
+ * Base class for the DWG parsers. It owns one {@link DWGParserConfig}
+ * instance and exposes that config's settings through {@code @Field}
+ * annotated setters, each of which simply delegates to the config.
+ */
+public abstract class AbstractDWGParser extends AbstractParser {
+
+
+ /**
+ * Serial version UID
+ */
+ private static final long serialVersionUID = 6261810259683381984L;
+ // Config instance that backs every getter and setter in this class.
+ private final DWGParserConfig defaultDwgParserConfig = new
DWGParserConfig();
+
+ /**
+ * Asks the parse context for a DWGParserConfig, passing this parser's own
+ * config as the default argument, and stores whatever comes back in the
+ * context under DWGParserConfig.class.
+ *
+ * @param parseContext context the config is read from and written back to
+ */
+ public void configure(ParseContext parseContext) {
+ DWGParserConfig dwgParserConfig =
parseContext.get(DWGParserConfig.class, defaultDwgParserConfig);
+ parseContext.set(DWGParserConfig.class, dwgParserConfig);
+ }
+
+
+ // Package-private read access to the configured dwgread executable.
+ String getDwgReadExecutable() {
+ return defaultDwgParserConfig.getDwgReadExecutable();
+ }
+
+ // Delegates to DWGParserConfig.setDwgReadExecutable.
+ @Field
+ public void setDwgReadExecutable(String dwgReadExecutable) {
+ defaultDwgParserConfig.setDwgReadExecutable(dwgReadExecutable);
+ }
+
+ // Package-private read access to the "clean dwgread output" flag.
+ boolean isCleanDwgReadOutput() {
+ return defaultDwgParserConfig.isCleanDwgReadOutput();
+ }
+
+ // Delegates to DWGParserConfig.setCleanDwgReadOutput.
+ @Field
+ public void setCleanDwgReadOutput(boolean cleanDwgReadOutput) {
+ defaultDwgParserConfig.setCleanDwgReadOutput(cleanDwgReadOutput);
+ }
+
+ // Package-private read access to the cleaning batch size.
+ int getCleanDwgReadOutputBatchSize() {
+ return defaultDwgParserConfig.getCleanDwgReadOutputBatchSize();
+ }
+
+ // Delegates to DWGParserConfig.setCleanDwgReadOutputBatchSize.
+ @Field
+ public void setCleanDwgReadOutputBatchSize(int
cleanDwgReadOutputBatchSize) {
+
defaultDwgParserConfig.setCleanDwgReadOutputBatchSize(cleanDwgReadOutputBatchSize);
+ }
+ // Package-private read access to the regex used when cleaning output.
+ String getCleanDwgReadRegexToReplace() {
+ return defaultDwgParserConfig.getCleanDwgReadRegexToReplace();
+ }
+
+ // Delegates to DWGParserConfig.setCleanDwgReadRegexToReplace.
+ @Field
+ public void setCleanDwgReadRegexToReplace(String
cleanDwgReadRegexToReplace) {
+
defaultDwgParserConfig.setCleanDwgReadRegexToReplace(cleanDwgReadRegexToReplace);
+ }
+ // Package-private read access to the replacement text used when cleaning.
+ String getCleanDwgReadReplaceWith() {
+ return defaultDwgParserConfig.getCleanDwgReadReplaceWith();
+ }
+
+ // Delegates to DWGParserConfig.setCleanDwgReadReplaceWith.
+ @Field
+ public void setCleanDwgReadReplaceWith(String cleanDwgReadReplaceWith) {
+
defaultDwgParserConfig.setCleanDwgReadReplaceWith(cleanDwgReadReplaceWith);
+ }
+ // Package-private read access to the dwgread timeout.
+ long getDwgReadTimeout() {
+ return defaultDwgParserConfig.getDwgReadTimeout();
+ }
+
+ // NOTE(review): the config's setter is spelled setDwgReadtimeout (lower-case
+ // "t"), unlike this method and the matching getter.
+ @Field
+ public void setDwgReadTimeout(long dwgReadTimeout) {
+ defaultDwgParserConfig.setDwgReadtimeout(dwgReadTimeout);
+ }
+
+}
diff --git
a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-cad-module/src/main/java/org/apache/tika/parser/dwg/DWGParser.java
b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-cad-module/src/main/java/org/apache/tika/parser/dwg/DWGParser.java
index 4519623fc..87b945e25 100644
---
a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-cad-module/src/main/java/org/apache/tika/parser/dwg/DWGParser.java
+++
b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-cad-module/src/main/java/org/apache/tika/parser/dwg/DWGParser.java
@@ -32,7 +32,6 @@ import org.apache.tika.metadata.Metadata;
import org.apache.tika.metadata.Property;
import org.apache.tika.metadata.TikaCoreProperties;
import org.apache.tika.mime.MediaType;
-import org.apache.tika.parser.AbstractParser;
import org.apache.tika.parser.ParseContext;
import org.apache.tika.sax.XHTMLContentHandler;
@@ -42,7 +41,7 @@ import org.apache.tika.sax.XHTMLContentHandler;
* Note that we use Apache POI for various parts of the processing, as
* lots of the low level string/int/short concepts are the same.
*/
-public class DWGParser extends AbstractParser {
+public class DWGParser extends AbstractDWGParser {
public static String DWG_CUSTOM_META_PREFIX = "dwg-custom:";
/**
* Serial version UID
@@ -51,84 +50,89 @@ public class DWGParser extends AbstractParser {
/**
* The order of the fields in the header
*/
- private static final Property[] HEADER_PROPERTIES_ENTRIES =
- {TikaCoreProperties.TITLE, TikaCoreProperties.DESCRIPTION,
TikaCoreProperties.CREATOR,
- TikaCoreProperties.SUBJECT, TikaCoreProperties.COMMENTS,
- TikaCoreProperties.MODIFIER, null, // Unknown?
- TikaCoreProperties.RELATION, // Hyperlink
- };
+ private static final Property[] HEADER_PROPERTIES_ENTRIES = {
TikaCoreProperties.TITLE,
+ TikaCoreProperties.DESCRIPTION, TikaCoreProperties.CREATOR,
TikaCoreProperties.SUBJECT,
+ TikaCoreProperties.COMMENTS, TikaCoreProperties.MODIFIER, null, //
Unknown?
+ TikaCoreProperties.RELATION, // Hyperlink
+ };
/**
* For the 2000 file, they're indexed
*/
- private static final Property[] HEADER_2000_PROPERTIES_ENTRIES =
- {null, TikaCoreProperties.RELATION, // 0x01
- TikaCoreProperties.TITLE, // 0x02
- TikaCoreProperties.DESCRIPTION, // 0x03
- TikaCoreProperties.CREATOR, // 0x04
- null, TikaCoreProperties.COMMENTS,// 0x06
- TikaCoreProperties.SUBJECT, // 0x07
- TikaCoreProperties.MODIFIER, // 0x08
- };
+ private static final Property[] HEADER_2000_PROPERTIES_ENTRIES = { null,
TikaCoreProperties.RELATION, // 0x01
+ TikaCoreProperties.TITLE, // 0x02
+ TikaCoreProperties.DESCRIPTION, // 0x03
+ TikaCoreProperties.CREATOR, // 0x04
+ null, TikaCoreProperties.COMMENTS, // 0x06
+ TikaCoreProperties.SUBJECT, // 0x07
+ TikaCoreProperties.MODIFIER, // 0x08
+ };
private static final String HEADER_2000_PROPERTIES_MARKER_STR = "DWGPROPS
COOKIE";
- private static final byte[] HEADER_2000_PROPERTIES_MARKER =
- new byte[HEADER_2000_PROPERTIES_MARKER_STR.length()];
+ private static final byte[] HEADER_2000_PROPERTIES_MARKER = new
byte[HEADER_2000_PROPERTIES_MARKER_STR.length()];
/**
- * How far to skip after the last standard property, before
- * we find any custom properties that might be there.
+ * How far to skip after the last standard property, before we find any
custom
+ * properties that might be there.
*/
private static final int CUSTOM_PROPERTIES_SKIP = 20;
/**
* The value of padding bytes other than 0 in some DWG files.
*/
- private static final int[] CUSTOM_PROPERTIES_ALT_PADDING_VALUES = new
int[]{0x2, 0, 0, 0};
+ private static final int[] CUSTOM_PROPERTIES_ALT_PADDING_VALUES = new
int[] { 0x2, 0, 0, 0 };
private static MediaType TYPE = MediaType.image("vnd.dwg");
static {
- StringUtil.putCompressedUnicode(HEADER_2000_PROPERTIES_MARKER_STR,
- HEADER_2000_PROPERTIES_MARKER, 0);
+ StringUtil.putCompressedUnicode(HEADER_2000_PROPERTIES_MARKER_STR,
HEADER_2000_PROPERTIES_MARKER, 0);
}
public Set<MediaType> getSupportedTypes(ParseContext context) {
return Collections.singleton(TYPE);
}
- public void parse(InputStream stream, ContentHandler handler, Metadata
metadata,
- ParseContext context) throws IOException, TikaException,
SAXException {
- // First up, which version of the format are we handling?
- byte[] header = new byte[128];
- IOUtils.readFully(stream, header);
- String version = new String(header, 0, 6, "US-ASCII");
-
- XHTMLContentHandler xhtml = new XHTMLContentHandler(handler, metadata);
- xhtml.startDocument();
-
- switch (version) {
- case "AC1015":
- metadata.set(Metadata.CONTENT_TYPE, TYPE.toString());
- if (skipTo2000PropertyInfoSection(stream, header)) {
- get2000Props(stream, metadata, xhtml);
- }
- break;
- case "AC1018":
- metadata.set(Metadata.CONTENT_TYPE, TYPE.toString());
- if (skipToPropertyInfoSection(stream, header)) {
- get2004Props(stream, metadata, xhtml);
- }
- break;
- case "AC1027":
- case "AC1032":
- case "AC1021":
- case "AC1024":
- metadata.set(Metadata.CONTENT_TYPE, TYPE.toString());
- if (skipToPropertyInfoSection(stream, header)) {
- get2007and2010Props(stream, metadata, xhtml);
- }
- break;
- default:
- throw new TikaException("Unsupported AutoCAD drawing version:
" + version);
- }
+ public void parse(InputStream stream, ContentHandler handler, Metadata
metadata, ParseContext context)
+ throws IOException, TikaException, SAXException {
- xhtml.endDocument();
+ configure(context);
+ DWGParserConfig dwgc = context.get(DWGParserConfig.class);
+
+ if (!dwgc.getDwgReadExecutable().isEmpty()) {
+ DWGReadParser dwr = new DWGReadParser();
+ dwr.parse(stream, handler, metadata, context);
+ } else {
+ // First up, which version of the format are we handling?
+ byte[] header = new byte[128];
+ IOUtils.readFully(stream, header);
+ String version = new String(header, 0, 6, "US-ASCII");
+
+ XHTMLContentHandler xhtml = new XHTMLContentHandler(handler,
metadata);
+ xhtml.startDocument();
+
+ switch (version) {
+ case "AC1015":
+ metadata.set(Metadata.CONTENT_TYPE, TYPE.toString());
+ if (skipTo2000PropertyInfoSection(stream, header)) {
+ get2000Props(stream, metadata, xhtml);
+ }
+ break;
+ case "AC1018":
+ metadata.set(Metadata.CONTENT_TYPE, TYPE.toString());
+ if (skipToPropertyInfoSection(stream, header)) {
+ get2004Props(stream, metadata, xhtml);
+ }
+ break;
+ case "AC1027":
+ case "AC1032":
+ case "AC1021":
+ case "AC1024":
+ metadata.set(Metadata.CONTENT_TYPE, TYPE.toString());
+ if (skipToPropertyInfoSection(stream, header)) {
+ get2007and2010Props(stream, metadata, xhtml);
+ }
+ break;
+ default:
+ throw new TikaException("Unsupported AutoCAD drawing
version: " + version);
+ }
+
+ xhtml.endDocument();
+ }
}
/**
@@ -169,8 +173,7 @@ public class DWGParser extends AbstractParser {
/**
* Stored as UCS2, so 16 bit "unicode"
*/
- private void get2007and2010Props(InputStream stream, Metadata metadata,
- XHTMLContentHandler xhtml)
+ private void get2007and2010Props(InputStream stream, Metadata metadata,
XHTMLContentHandler xhtml)
throws IOException, TikaException, SAXException {
// Standard properties
for (int i = 0; i < HEADER_PROPERTIES_ENTRIES.length; i++) {
@@ -246,8 +249,8 @@ public class DWGParser extends AbstractParser {
}
}
- private void handleHeader(int headerNumber, String value, Metadata
metadata,
- XHTMLContentHandler xhtml) throws SAXException {
+ private void handleHeader(int headerNumber, String value, Metadata
metadata, XHTMLContentHandler xhtml)
+ throws SAXException {
if (value == null || value.length() == 0) {
return;
}
@@ -263,14 +266,13 @@ public class DWGParser extends AbstractParser {
/**
* Grab the offset, then skip there
*/
- private boolean skipToPropertyInfoSection(InputStream stream, byte[]
header)
- throws IOException, TikaException {
+ private boolean skipToPropertyInfoSection(InputStream stream, byte[]
header) throws IOException, TikaException {
// The offset is stored in the header from 0x20 onwards
long offsetToSection = EndianUtils.getLongLE(header, 0x20);
// Bounds check the offset. Some files seem to use a different format,
- // and the offset isn't available at 0x20. Until we can work out how
- // to find the offset in those files, skip them if detected
+ // and the offset isn't available at 0x20. Until we can work out how
+ // to find the offset in those files, skip them if detected
if (offsetToSection > 0xa00000l) {
// Header should never be more than 10mb into the file, something
is wrong
offsetToSection = 0;
@@ -289,8 +291,7 @@ public class DWGParser extends AbstractParser {
/**
* We think it can be anywhere...
*/
- private boolean skipTo2000PropertyInfoSection(InputStream stream, byte[]
header)
- throws IOException {
+ private boolean skipTo2000PropertyInfoSection(InputStream stream, byte[]
header) throws IOException {
int val = 0;
while (val != -1) {
val = stream.read();
@@ -315,11 +316,11 @@ public class DWGParser extends AbstractParser {
// There should be 4 zero bytes or
CUSTOM_PROPERTIES_ALT_PADDING_VALUES next
byte[] padding = new byte[4];
IOUtils.readFully(stream, padding);
- if ((padding[0] == 0 && padding[1] == 0 && padding[2] == 0 &&
padding[3] == 0) ||
- (padding[0] == CUSTOM_PROPERTIES_ALT_PADDING_VALUES[0] &&
- padding[1] == CUSTOM_PROPERTIES_ALT_PADDING_VALUES[1]
&&
- padding[2] == CUSTOM_PROPERTIES_ALT_PADDING_VALUES[2]
&&
- padding[3] ==
CUSTOM_PROPERTIES_ALT_PADDING_VALUES[3])) {
+ if ((padding[0] == 0 && padding[1] == 0 && padding[2] == 0 &&
padding[3] == 0)
+ || (padding[0] == CUSTOM_PROPERTIES_ALT_PADDING_VALUES[0]
+ && padding[1] ==
CUSTOM_PROPERTIES_ALT_PADDING_VALUES[1]
+ && padding[2] ==
CUSTOM_PROPERTIES_ALT_PADDING_VALUES[2]
+ && padding[3] ==
CUSTOM_PROPERTIES_ALT_PADDING_VALUES[3])) {
// Looks hopeful, skip on
padding = new byte[CUSTOM_PROPERTIES_SKIP];
diff --git
a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-cad-module/src/main/java/org/apache/tika/parser/dwg/DWGParserConfig.java
b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-cad-module/src/main/java/org/apache/tika/parser/dwg/DWGParserConfig.java
new file mode 100644
index 000000000..35300080b
--- /dev/null
+++
b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-cad-module/src/main/java/org/apache/tika/parser/dwg/DWGParserConfig.java
@@ -0,0 +1,128 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.tika.parser.dwg;
+
+import java.io.File;
+import java.io.IOException;
+import java.io.Serializable;
+import java.nio.file.Files;
+import java.nio.file.Paths;
+import java.util.Arrays;
+import java.util.Map;
+
+import org.slf4j.Logger;
+import org.slf4j.LoggerFactory;
+
+import org.apache.tika.config.Param;
+import org.apache.tika.exception.TikaConfigException;
+import org.apache.tika.parser.external.ExternalParser;
+import org.apache.tika.utils.StringUtils;
+
+/**
+ * Serializable holder for the dwgread-related settings: the path of the
+ * dwgread executable, whether and how its output is cleaned (flag, batch
+ * size, regex and replacement text) and the timeout.
+ */
+public class DWGParserConfig implements Serializable {
+
+ private static final long serialVersionUID = -7623524257255755725L;
+ // Empty by default; DWGParser only hands off to DWGReadParser when this is
+ // non-empty.
+ private String dwgReadExecutable = "";
+ private boolean cleanDwgReadOutput = true;
+ private int cleanDwgReadOutputBatchSize = 10000000;
+ // default to 5 minutes, some large DWG's do take a while...
+ // (300000 is 5 minutes if the unit is milliseconds - TODO confirm unit)
+ private long dwgReadTimeout = 300000;
+ // The default pattern matches every character outside the range 0x20-0x7e
+ // (printable ASCII), not merely "non UTF" characters. It does not match the
+ // literal "nan" values dwgread emits; presumably those are handled by the
+ // parser that consumes this config - TODO confirm.
+ private String cleanDwgReadRegexToReplace = "[^\\x20-\\x7e]";
+ private String cleanDwgReadReplaceWith = "";
+ // Written by initialize(); never read inside this class.
+ @SuppressWarnings("unused")
+ private boolean hasDwgRead;
+ private static final Logger LOG =
LoggerFactory.getLogger(DWGParserConfig.class);
+
+ /**
+ * Runs the dwgread availability check and caches its result in the
+ * hasDwgRead field. The params argument is not used.
+ *
+ * @param params ignored
+ * @throws TikaConfigException propagated from {@link #hasDwgRead()}
+ */
+ public void initialize(Map<String, Param> params) throws
TikaConfigException {
+ hasDwgRead = hasDwgRead();
+
+ }
+
+ /**
+ * Checks whether the configured dwgread executable can be used.
+ *
+ * @return the result of ExternalParser.check for the configured command
+ * @throws TikaConfigException if a non-blank executable path is configured
+ * but is not a regular file
+ */
+ public boolean hasDwgRead() throws TikaConfigException {
+ // Fetch where the config says to find DWGRead
+ String dwgRead = getDwgReadExecutable();
+
+ // A blank path skips this file check and falls through to the run check.
+ if (!StringUtils.isBlank(dwgRead) &&
!Files.isRegularFile(Paths.get(dwgRead))) {
+ throw new TikaConfigException("DwgRead cannot be found at: " +
dwgRead);
+ }
+
+ // Try running DWGRead from there, and see if it exists + works
+ // NOTE(review): when dwgRead is blank the check is still invoked with a
+ // blank command; confirm that ExternalParser.check tolerates that.
+ String[] checkCmd = { dwgRead };
+ boolean hasDwgRead = ExternalParser.check(checkCmd);
+ LOG.debug("hasDwgRead (path: " + Arrays.toString(checkCmd) + "): " +
hasDwgRead);
+ return hasDwgRead;
+ }
+
+ public String getDwgReadExecutable() {
+
+ return dwgReadExecutable;
+ }
+
+ public boolean isCleanDwgReadOutput() {
+ return cleanDwgReadOutput;
+ }
+
+ public int getCleanDwgReadOutputBatchSize() {
+ return cleanDwgReadOutputBatchSize;
+ }
+
+ public long getDwgReadTimeout() {
+ return dwgReadTimeout;
+ }
+
+ public String getCleanDwgReadRegexToReplace() {
+ return cleanDwgReadRegexToReplace;
+ }
+
+ public String getCleanDwgReadReplaceWith() {
+ return cleanDwgReadReplaceWith;
+ }
+
+ /**
+ * Stores the dwgread executable path. A path that is not absolute is first
+ * converted to its canonical form; if that conversion fails with an
+ * IOException the value is stored as given.
+ * <p>
+ * NOTE(review): Paths.get throws NullPointerException for a null argument,
+ * and an empty string is not absolute, so it is canonicalized to a non-empty
+ * path (the working directory) - which makes the isEmpty() test in
+ * DWGParser.parse false. Confirm callers never pass null or "".
+ *
+ * @param dwgReadExecutable absolute or relative path of the executable
+ */
+ public void setDwgReadExecutable(String dwgReadExecutable) {
+ if (!Paths.get(dwgReadExecutable).isAbsolute())
+ try {
+ dwgReadExecutable = new
File(dwgReadExecutable).getCanonicalFile().toString();
+ } catch (IOException e) {
+ //do nothing as the error will be picked up by the DWG Parser
+ }
+
+
+ this.dwgReadExecutable = dwgReadExecutable;
+ }
+
+ public void setCleanDwgReadOutput(boolean cleanDwgReadOutput) {
+ this.cleanDwgReadOutput = cleanDwgReadOutput;
+ }
+
+ public void setCleanDwgReadOutputBatchSize(int
cleanDwgReadOutputBatchSize) {
+ this.cleanDwgReadOutputBatchSize = cleanDwgReadOutputBatchSize;
+ }
+
+ // NOTE(review): spelled with a lower-case "t" ("Readtimeout"), unlike the
+ // getter getDwgReadTimeout; AbstractDWGParser calls it by this spelling.
+ public void setDwgReadtimeout(long dwgReadtimeout) {
+ this.dwgReadTimeout = dwgReadtimeout;
+ }
+
+ public void setCleanDwgReadRegexToReplace(String
cleanDwgReadRegexToReplace) {
+ this.cleanDwgReadRegexToReplace = cleanDwgReadRegexToReplace;
+ }
+
+ public void setCleanDwgReadReplaceWith(String cleanDwgReadReplaceWith) {
+ this.cleanDwgReadReplaceWith = cleanDwgReadReplaceWith;
+ }
+
+}
diff --git
a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-cad-module/src/main/java/org/apache/tika/parser/dwg/DWGReadFormatRemover.java
b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-cad-module/src/main/java/org/apache/tika/parser/dwg/DWGReadFormatRemover.java
new file mode 100644
index 000000000..9a5ab4bd2
--- /dev/null
+++
b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-cad-module/src/main/java/org/apache/tika/parser/dwg/DWGReadFormatRemover.java
@@ -0,0 +1,106 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.tika.parser.dwg;
+
+import java.util.regex.Matcher;
+import java.util.regex.Pattern;
+
+/**
+ * DWGReadFormatRemover removes the formatting from the text from libredwg
files so only
+ * the raw text remains.
+ * What needs to be cleaned has been found on the following websites:
+ * <p>
+ * <a
href="https://www.cadforum.cz/en/text-formatting-codes-in-mtext-objects-tip8640">
+ *
https://www.cadforum.cz/en/text-formatting-codes-in-mtext-objects-tip8640</a>
+ * <p>
+ * <a
href="https://adndevblog.typepad.com/autocad/2017/09/dissecting-mtext-format-codes.html">
+ *
https://adndevblog.typepad.com/autocad/2017/09/dissecting-mtext-format-codes.html</a>
+ * <p>
+ */
+
+public class DWGReadFormatRemover {
+ // Every pattern below starts with an alternative that matches escaped
+ // (doubled) backslashes. Consuming them as a match, and then leaving that
+ // match untouched, stops the second backslash of a pair from being read as
+ // the start of a format code.
+ // NOTE(review): these are compiled again on every cleanupDwgString call;
+ // precompiled static Pattern constants would avoid that.
+
+ // Escaped backslashes, or a backslash followed by one of L, l, O, o, K, k.
+ private static final String underlineStrikeThrough =
"((?:\\\\\\\\)+|\\\\[LlOoKk])";
+ // Escaped backslashes, or a backslash plus one of the listed codes, followed
+ // by up to 100 non-semicolon characters and a terminating semicolon.
+ private static final String endMarks =
"((?:\\\\\\\\)+|\\\\(?:A|H|pi|pxt|pxi|pt|X|Q|f|F|W|C|T)[^;]{0,100};)";
+ // Escaped backslashes, or the backslash-P marker.
+ private static final String newLine = "((?:\\\\\\\\)+|\\\\P)";
+ // Group 1: escaped backslashes. Otherwise backslash-S, then group 2 (1-20
+ // chars before the separator), a separator (/, ^ or #), then group 3 (1-20
+ // chars before the closing semicolon).
+ private static final String stackFrac =
"(\\\\\\\\)+|\\\\S([^/^#]{1,20})[/^#]([^;]{1,20});";
+ // Group 1: a brace preceded by one or more backslashes. Group 2: a bare
+ // brace.
+ private static final String curlyBraces = "(\\\\)+[{}]|([{}])";
+ // A single backslash that is neither preceded nor followed by a backslash.
+ private static final String escapeChars = "(?<!\\\\)(\\\\)(?!\\\\)";
+ /**
+ * Removes formatting codes from a string by applying a fixed sequence of
+ * regex passes: underline/overline/strike-through codes, semicolon-terminated
+ * codes, the new-line marker, stacked fractions, unescaped curly braces,
+ * lone escape backslashes, and finally un-escaping doubled backslashes.
+ *
+ * @param dwgString text to clean; must not be null (it is passed straight
+ * to a regex matcher)
+ * @return the cleaned text
+ */
+ public String cleanupDwgString(String dwgString) {
+ String cleanString = dwgString;
+ StringBuffer sb = new StringBuffer();
+ //Strip off start/stop underline/overstrike/strike throughs
+ Matcher m =
Pattern.compile(underlineStrikeThrough).matcher(cleanString);
+ while (m.find()) {
+ // A match ending in a backslash is a run of escaped backslashes: leave
+ // it alone (appendTail / the next appendReplacement copies it through).
+ if (! m.group(1).endsWith("\\")) {
+ m.appendReplacement(sb, "");
+ }
+ }
+ m.appendTail(sb);
+ cleanString = sb.toString();
+
+ //Strip off semi-colon ended markers
+ m = Pattern.compile(endMarks).matcher(cleanString);
+ sb.setLength(0);
+ while (m.find()) {
+ // Same rule as above: only real format codes are dropped.
+ if (! m.group(1).endsWith("\\")) {
+ m.appendReplacement(sb, "");
+ }
+ }
+ m.appendTail(sb);
+ cleanString = sb.toString();
+
+ //new line marker \\P replace with actual new line
+ m = Pattern.compile(newLine).matcher(cleanString);
+ sb.setLength(0);
+ while (m.find()) {
+ // Only the marker alternative ends in "P"; escaped backslashes do not.
+ if (m.group(1).endsWith("P")) {
+ m.appendReplacement(sb, "\n");
+ }
+ }
+ m.appendTail(sb);
+ cleanString = sb.toString();
+
+ //stacking fractions
+ m = Pattern.compile(stackFrac).matcher(cleanString);
+ sb.setLength(0);
+ while (m.find()) {
+ // group(1) is null when the fraction alternative matched.
+ // NOTE(review): the captured text is used as a replacement string, in
+ // which '$' and '\' are special; a fraction part containing either can
+ // throw or be mangled. Matcher.quoteReplacement would make it literal.
+ if (m.group(1) == null) {
+ m.appendReplacement(sb, m.group(2) + "/" + m.group(3));
+ }
+ }
+ m.appendTail(sb);
+ cleanString = sb.toString();
+
+ //strip brackets around text, make sure they aren't escaped
+ m = Pattern.compile(curlyBraces).matcher(cleanString);
+ sb.setLength(0);
+ while (m.find()) {
+ // group(1) is null only for a bare brace; braces preceded by a
+ // backslash are kept here.
+ if (m.group(1) == null) {
+ m.appendReplacement(sb, "");
+ }
+ }
+ m.appendTail(sb);
+ cleanString = sb.toString();
+ //now get rid of escape characters
+ cleanString = cleanString.replaceAll(escapeChars, "");
+ //now unescape backslash
+ cleanString = cleanString.replaceAll("(\\\\\\\\)", "\\\\");
+ return cleanString;
+ }
+
+}
diff --git
a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-cad-module/src/main/java/org/apache/tika/parser/dwg/DWGReadParser.java
b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-cad-module/src/main/java/org/apache/tika/parser/dwg/DWGReadParser.java
new file mode 100644
index 000000000..fe9a1b663
--- /dev/null
+++
b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-cad-module/src/main/java/org/apache/tika/parser/dwg/DWGReadParser.java
@@ -0,0 +1,325 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.tika.parser.dwg;
+
+import java.io.BufferedReader;
+import java.io.BufferedWriter;
+import java.io.File;
+import java.io.FileOutputStream;
+import java.io.IOException;
+import java.io.InputStream;
+import java.io.InputStreamReader;
+import java.io.OutputStreamWriter;
+import java.nio.charset.StandardCharsets;
+import java.nio.file.Files;
+import java.time.Instant;
+import java.util.Arrays;
+import java.util.Collections;
+import java.util.List;
+import java.util.Set;
+import java.util.UUID;
+import java.util.function.Consumer;
+
+import com.fasterxml.jackson.core.JsonFactory;
+import com.fasterxml.jackson.core.JsonParseException;
+import com.fasterxml.jackson.core.JsonParser;
+import com.fasterxml.jackson.core.JsonToken;
+import com.fasterxml.jackson.core.json.JsonReadFeature;
+import org.apache.commons.io.FileUtils;
+import org.apache.commons.lang3.StringUtils;
+import org.slf4j.Logger;
+import org.slf4j.LoggerFactory;
+import org.xml.sax.ContentHandler;
+import org.xml.sax.SAXException;
+
+import org.apache.tika.exception.TikaException;
+import org.apache.tika.metadata.Metadata;
+import org.apache.tika.metadata.TikaCoreProperties;
+import org.apache.tika.mime.MediaType;
+import org.apache.tika.parser.ParseContext;
+import org.apache.tika.sax.XHTMLContentHandler;
+import org.apache.tika.utils.ExceptionUtils;
+import org.apache.tika.utils.FileProcessResult;
+import org.apache.tika.utils.ProcessUtils;
+
+
+
+
+
+/**
+ * DWGReadParser (CAD Drawing) parser. This extends the original DWGParser if
+ * DwgRead is set in the parser configuration. DWG reader can be found here:
+ * <p>
+ * <a href="https://github.com/LibreDWG/libredwg">https://github.com/LibreDWG/libredwg</a>
+ * <p>
+ * DWGRead outputs json which we then loop through, extracting the text elements.
+ * The required configuration is dwgReadExecutable. The other settings which can be
+ * overwritten are:
+ * <p>
+ * boolean : cleanDwgReadOutput - whether to clean the json output
+ * <p>
+ * int : cleanDwgReadOutputBatchSize - clean output batch size to process
+ * <p>
+ * long : dwgReadTimeout - timeout in milliseconds before killing the dwgread process
+ * <p>
+ * String : cleanDwgReadRegexToReplace - characters to replace in the json
+ * <p>
+ * String : cleanDwgReadReplaceWith - replacement characters
+ */
+
+public class DWGReadParser extends AbstractDWGParser {
+ private static final Logger LOG =
LoggerFactory.getLogger(DWGReadParser.class);
+ /**
+ *
+ */
+ private static final long serialVersionUID = 7983127145030096837L;
+ private static MediaType TYPE = MediaType.image("vnd.dwg");
+
+    /**
+     * Returns the media types handled by this parser.
+     *
+     * @param context the parse context (not consulted)
+     * @return a singleton set containing {@code image/vnd.dwg}
+     */
+    @Override
+    public Set<MediaType> getSupportedTypes(ParseContext context) {
+        return Collections.singleton(TYPE);
+    }
+
+    /**
+     * Runs the configured dwgread executable on the incoming stream and turns its JSON output
+     * into XHTML text and metadata.
+     * <p>
+     * The stream is copied to a temporary {@code .dwg} file, dwgread is invoked with
+     * {@code -O JSON -o <out> <in>}, the JSON is optionally cleaned line by line, and the
+     * result is read with a lenient Jackson parser. Only the {@code OBJECTS},
+     * {@code FILEHEADER} and {@code SummaryInfo} sections are interpreted. All temporary
+     * files are removed before the method returns, whether or not it succeeds.
+     *
+     * @param stream   the DWG document
+     * @param handler  receives the extracted text as XHTML SAX events
+     * @param metadata receives header and summary-info values
+     * @param context  parse context; must yield a {@link DWGParserConfig} after {@code configure}
+     * @throws IOException   if a temporary file cannot be written or read
+     * @throws SAXException  if the content handler rejects an event
+     * @throws TikaException if dwgread times out, exits non-zero, or its output cannot be opened
+     */
+    @Override
+    public void parse(InputStream stream, ContentHandler handler, Metadata metadata, ParseContext context)
+            throws IOException, SAXException, TikaException {
+
+        configure(context);
+        DWGParserConfig dwgc = context.get(DWGParserConfig.class);
+        final XHTMLContentHandler xhtml = new XHTMLContentHandler(handler, metadata);
+        xhtml.startDocument();
+        // create unique files so we avoid overwriting out files if multithreaded
+        UUID uuid = UUID.randomUUID();
+        File tmpFileOut = File.createTempFile(uuid + "dwgreadout", ".json");
+        File tmpFileOutCleaned = File.createTempFile(uuid + "dwgreadoutclean", ".json");
+        File tmpFileIn = File.createTempFile(uuid + "dwgreadin", ".dwg");
+        try {
+            FileUtils.copyInputStreamToFile(stream, tmpFileIn);
+
+            List<String> command = Arrays.asList(dwgc.getDwgReadExecutable(), "-O", "JSON", "-o",
+                    tmpFileOut.getCanonicalPath(), tmpFileIn.getCanonicalPath());
+            ProcessBuilder pb = new ProcessBuilder().command(command);
+            LOG.info("About to call DWGRead: {}", command);
+            FileProcessResult fpr = ProcessUtils.execute(pb, dwgc.getDwgReadTimeout(), 10000, 10000);
+            LOG.info("DWGRead Exit code is: {}", fpr.getExitValue());
+            if (fpr.getExitValue() != 0) {
+                if (fpr.isTimeout()) {
+                    throw new TikaException(
+                            "DWGRead Failed - Timeout setting exceeded current setting of "
+                                    + dwgc.getDwgReadTimeout());
+                }
+                throw new TikaException(
+                        "DWGRead Failed - Exit Code is:" + fpr.getExitValue()
+                                + " Exe error is: " + fpr.getStderr());
+            }
+
+            File jsonFile = tmpFileOut;
+            if (dwgc.isCleanDwgReadOutput()) {
+                // dwgread sometimes creates strings with invalid utf-8 sequences or invalid
+                // json (nan instead of NaN). replace them before parsing.
+                LOG.debug("Cleaning Json Output - Replace: {} with: {}",
+                        dwgc.getCleanDwgReadRegexToReplace(), dwgc.getCleanDwgReadReplaceWith());
+                try {
+                    writeCleanedJson(tmpFileOut, tmpFileOutCleaned, dwgc);
+                } finally {
+                    // the raw input and raw output are no longer needed; free the disk space early
+                    FileUtils.deleteQuietly(tmpFileIn);
+                    FileUtils.deleteQuietly(tmpFileOut);
+                }
+                jsonFile = tmpFileOutCleaned;
+            } else {
+                LOG.debug(
+                        "Json wasn't cleaned, "
+                                + "if json parsing fails consider reviewing dwgread json output to check it's valid");
+            }
+
+            extractContent(jsonFile, xhtml, metadata);
+        } finally {
+            // make sure we delete all temp files
+            FileUtils.deleteQuietly(tmpFileOut);
+            FileUtils.deleteQuietly(tmpFileIn);
+            FileUtils.deleteQuietly(tmpFileOutCleaned);
+        }
+
+        xhtml.endDocument();
+    }
+
+    /**
+     * Copies the raw dwgread JSON to {@code cleanedJson}, applying the configured regex
+     * replacement and the fixed nan / trailing-dot repairs to every line.
+     */
+    private void writeCleanedJson(File rawJson, File cleanedJson, DWGParserConfig dwgc) throws IOException {
+        String regexToReplace = dwgc.getCleanDwgReadRegexToReplace();
+        String replaceWith = dwgc.getCleanDwgReadReplaceWith();
+        // InputStreamReader (rather than Files.newBufferedReader) is deliberate: it substitutes
+        // malformed UTF-8 sequences instead of failing on them.
+        try (BufferedReader br = new BufferedReader(
+                new InputStreamReader(Files.newInputStream(rawJson.toPath()), StandardCharsets.UTF_8));
+             BufferedWriter out = new BufferedWriter(
+                     new OutputStreamWriter(new FileOutputStream(cleanedJson, true), StandardCharsets.UTF_8),
+                     32768)) {
+            String sCurrentLine;
+            while ((sCurrentLine = br.readLine()) != null) {
+                sCurrentLine = sCurrentLine
+                        .replaceAll(regexToReplace, replaceWith)
+                        .replaceAll(" nan,", " 0,")
+                        .replaceAll(" nan ", " 0 ")
+                        .replaceAll("\\.,", " \\. ,") + "\n";
+                out.write(sCurrentLine);
+            }
+        }
+    }
+
+    /**
+     * Streams the dwgread JSON file and forwards text to {@code xhtml} and header /
+     * summary values to {@code metadata}; the JSON parser is always closed.
+     */
+    private void extractContent(File jsonFile, XHTMLContentHandler xhtml, Metadata metadata)
+            throws IOException, SAXException, TikaException {
+        // we can't guarantee the json output is correct so we try to ignore as many
+        // errors as we can
+        JsonFactory jfactory = JsonFactory.builder()
+                .enable(JsonReadFeature.ALLOW_MISSING_VALUES,
+                        JsonReadFeature.ALLOW_UNESCAPED_CONTROL_CHARS,
+                        JsonReadFeature.ALLOW_BACKSLASH_ESCAPING_ANY_CHARACTER,
+                        JsonReadFeature.ALLOW_UNQUOTED_FIELD_NAMES,
+                        JsonReadFeature.ALLOW_TRAILING_COMMA,
+                        JsonReadFeature.ALLOW_NON_NUMERIC_NUMBERS,
+                        JsonReadFeature.ALLOW_LEADING_ZEROS_FOR_NUMBERS)
+                .build();
+        final JsonParser created;
+        try {
+            created = jfactory.createParser(jsonFile);
+        } catch (JsonParseException e1) {
+            throw new TikaException("Failed to parse Json: " + ExceptionUtils.getStackTrace(e1), e1);
+        } catch (IOException e1) {
+            throw new TikaException("Failed to read json file: " + ExceptionUtils.getStackTrace(e1), e1);
+        }
+
+        final DWGReadFormatRemover dwgReadFormatRemover = new DWGReadFormatRemover();
+        // Consumer cannot throw checked exceptions, so the first SAXException is parked here
+        // and rethrown as soon as parseDwgObject returns.
+        final SAXException[] saxFailure = new SAXException[1];
+        Consumer<String> textConsumer = (nextTextValue) -> {
+            if (saxFailure[0] != null) {
+                return;
+            }
+            try {
+                xhtml.characters(dwgReadFormatRemover.cleanupDwgString(nextTextValue));
+                xhtml.newline();
+            } catch (SAXException e) {
+                LOG.error("Could not write next text value {} to xhtml stream", nextTextValue);
+                saxFailure[0] = e;
+            }
+        };
+
+        // read json token in a stream using jackson, iterate over each token. We only
+        // support OBJECTS, FILEHEADER and SummaryInfo
+        // these are the only ones we have in either sample files or have been tested
+        // with
+        try (JsonParser jParser = created) {
+            if (jParser.nextToken() == null) {
+                // dwgread produced no JSON at all; nothing to extract
+                return;
+            }
+            JsonToken nextToken;
+            // a null token means end of input; stop instead of spinning forever
+            while ((nextToken = jParser.nextToken()) != null && nextToken != JsonToken.END_OBJECT) {
+                if (nextToken != JsonToken.FIELD_NAME) {
+                    continue;
+                }
+                String nextFieldName = jParser.currentName();
+                JsonToken valueToken = jParser.nextToken();
+                if (valueToken == null || !valueToken.isStructStart()) {
+                    continue;
+                }
+                if ("OBJECTS".equals(nextFieldName) && valueToken == JsonToken.START_ARRAY) {
+                    // the parser already sits on START_ARRAY, so each nextToken() here is
+                    // either an element or the closing END_ARRAY
+                    JsonToken element;
+                    while ((element = jParser.nextToken()) != null && element != JsonToken.END_ARRAY) {
+                        if (element == JsonToken.START_OBJECT) {
+                            parseDwgObject(jParser, textConsumer);
+                            if (saxFailure[0] != null) {
+                                throw saxFailure[0];
+                            }
+                        } else {
+                            jParser.skipChildren();
+                        }
+                    }
+                } else if ("FILEHEADER".equals(nextFieldName) && valueToken == JsonToken.START_OBJECT) {
+                    parseHeader(jParser, metadata);
+                } else if ("SummaryInfo".equals(nextFieldName) && valueToken == JsonToken.START_OBJECT) {
+                    parseSummaryInfo(jParser, metadata);
+                } else {
+                    jParser.skipChildren();
+                }
+            }
+        }
+    }
+
+    /**
+     * Walks the fields of the current JSON object up to its END_OBJECT and hands every
+     * non-blank scalar value of a {@code text} or {@code text_value} field to the consumer.
+     * Nested arrays and objects are skipped.
+     *
+     * @param jsonParser   parser positioned inside the object to read
+     * @param textConsumer receives each non-blank text value
+     * @throws IOException if the underlying JSON cannot be read
+     */
+    private void parseDwgObject(JsonParser jsonParser, Consumer<String> textConsumer) throws IOException {
+        for (JsonToken token = jsonParser.nextToken(); token != JsonToken.END_OBJECT;
+                token = jsonParser.nextToken()) {
+            if (token != JsonToken.FIELD_NAME) {
+                continue;
+            }
+            String fieldName = jsonParser.currentName();
+            JsonToken value = jsonParser.nextToken();
+            if (value.isStructStart()) {
+                jsonParser.skipChildren();
+                continue;
+            }
+            boolean carriesText = "text".equals(fieldName) || "text_value".equals(fieldName);
+            if (value.isScalarValue() && carriesText) {
+                String textVal = jsonParser.getText();
+                if (StringUtils.isNotBlank(textVal)) {
+                    textConsumer.accept(textVal);
+                }
+            }
+        }
+    }
+
+    /**
+     * Copies every scalar field of the current JSON object into the metadata under its
+     * own field name, skipping nested arrays and objects, until END_OBJECT is reached.
+     *
+     * @param jsonParser parser positioned inside the header object
+     * @param metadata   receives one entry per scalar field
+     * @throws IOException if the underlying JSON cannot be read
+     */
+    private void parseHeader(JsonParser jsonParser, Metadata metadata) throws IOException {
+        for (JsonToken token = jsonParser.nextToken(); token != JsonToken.END_OBJECT;
+                token = jsonParser.nextToken()) {
+            if (token != JsonToken.FIELD_NAME) {
+                continue;
+            }
+            String key = jsonParser.currentName();
+            JsonToken value = jsonParser.nextToken();
+            if (value.isStructStart()) {
+                jsonParser.skipChildren();
+            } else if (value.isScalarValue()) {
+                metadata.set(key, jsonParser.getText());
+            }
+        }
+    }
+
+    /**
+     * Reads the summary-info object up to its END_OBJECT and maps it onto the metadata.
+     * <p>
+     * {@code TDCREATE} and {@code TDUPDATE} are read as two-element arrays
+     * {@code [julianDate, millisecondOfDay]} and stored as the created / modified dates.
+     * Other nested structures are skipped. Non-blank scalar values are stored as title
+     * ({@code TITLE}), modifier ({@code LASTSAVEDBY}) or under their own field name, except
+     * for fields whose name starts with "unknown" (case-insensitive), which are dropped.
+     *
+     * @param jsonParser parser positioned inside the summary-info object
+     * @param metadata   receives the mapped values
+     * @throws IOException if the underlying JSON cannot be read
+     */
+    private void parseSummaryInfo(JsonParser jsonParser, Metadata metadata) throws IOException {
+        for (JsonToken token = jsonParser.nextToken(); token != JsonToken.END_OBJECT;
+                token = jsonParser.nextToken()) {
+            if (token != JsonToken.FIELD_NAME) {
+                continue;
+            }
+            String fieldName = jsonParser.currentName();
+            JsonToken value = jsonParser.nextToken();
+
+            if (value.isStructStart()) {
+                boolean isCreate = "TDCREATE".equals(fieldName);
+                if (!isCreate && !"TDUPDATE".equals(fieldName)) {
+                    jsonParser.skipChildren();
+                    continue;
+                }
+                // the parser sits on the array start; step onto the first element
+                jsonParser.nextToken();
+                long julianDay = jsonParser.getValueAsLong();
+                // second element
+                jsonParser.nextToken();
+                long millisecondsIntoDay = jsonParser.getValueAsLong();
+                Instant instant = JulianDateUtil.toInstant(julianDay, millisecondsIntoDay);
+                // consume the end of the array
+                jsonParser.nextToken();
+                metadata.set(isCreate ? TikaCoreProperties.CREATED : TikaCoreProperties.MODIFIED,
+                        instant.toString());
+                continue;
+            }
+
+            if (!value.isScalarValue()) {
+                continue;
+            }
+            String textVal = jsonParser.getText();
+            if (StringUtils.isBlank(textVal)) {
+                continue;
+            }
+            LOG.debug("Summary Info - {} = {}", fieldName, textVal);
+            if ("TITLE".equals(fieldName)) {
+                metadata.set(TikaCoreProperties.TITLE, textVal);
+            } else if ("LASTSAVEDBY".equals(fieldName)) {
+                metadata.set(TikaCoreProperties.MODIFIER, textVal);
+            } else if (!StringUtils.startsWithIgnoreCase(fieldName, "unknown")) {
+                metadata.set(fieldName, textVal);
+            }
+        }
+    }
+
+}
diff --git
a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-cad-module/src/main/java/org/apache/tika/parser/dwg/JulianDateUtil.java
b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-cad-module/src/main/java/org/apache/tika/parser/dwg/JulianDateUtil.java
new file mode 100644
index 000000000..522df0883
--- /dev/null
+++
b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-cad-module/src/main/java/org/apache/tika/parser/dwg/JulianDateUtil.java
@@ -0,0 +1,47 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.tika.parser.dwg;
+
+import java.time.Instant;
+import java.time.ZoneOffset;
+import java.time.ZonedDateTime;
+import java.time.temporal.ChronoUnit;
+
+/**
+ * Converts a DWG timestamp, given as a Julian day number plus the number of
+ * milliseconds into that day, to a {@link Instant}.
+ */
+final class JulianDateUtil {
+    /** 1858-11-16T12:00:00Z, the instant that lies 2,400,000 days after {@link #JULIAN_DATE}. */
+    public static final Instant REDUCED_JD =
+            ZonedDateTime.of(1858, 11, 16, 12, 0, 0, 0, ZoneOffset.UTC).toInstant();
+    /** Instant corresponding to Julian day 0. */
+    public static final Instant JULIAN_DATE = REDUCED_JD.minus(2400000, ChronoUnit.DAYS);
+
+    private JulianDateUtil() {
+        // static utility, not instantiable
+    }
+
+    /**
+     * Returns the instant that lies {@code julianDay} whole days plus
+     * {@code millisecondsIntoDay} milliseconds after {@link #JULIAN_DATE}.
+     * <p>
+     * The milliseconds are added as a duration. They are not interpreted as the decimal
+     * digits of a day fraction, so 43,200,000 ms is exactly half a day.
+     * NOTE(review): the day is taken to start at {@link #JULIAN_DATE}'s time of day
+     * (12:00 UTC), as before - confirm this matches the DWG convention.
+     *
+     * @param julianDay           whole days since Julian day 0
+     * @param millisecondsIntoDay milliseconds elapsed within that day
+     * @return the corresponding instant
+     */
+    public static Instant toInstant(long julianDay, long millisecondsIntoDay) {
+        return JULIAN_DATE.plus(julianDay, ChronoUnit.DAYS).plusMillis(millisecondsIntoDay);
+    }
+}
diff --git
a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-cad-module/src/test/java/org/apache/tika/parser/dwg/DWGParserTest.java
b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-cad-module/src/test/java/org/apache/tika/parser/dwg/DWGParserTest.java
index 88807b087..077e7700f 100644
---
a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-cad-module/src/test/java/org/apache/tika/parser/dwg/DWGParserTest.java
+++
b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-cad-module/src/test/java/org/apache/tika/parser/dwg/DWGParserTest.java
@@ -16,23 +16,48 @@
*/
package org.apache.tika.parser.dwg;
-import static org.apache.tika.TikaTest.assertContains;
+
+
import static org.junit.jupiter.api.Assertions.assertEquals;
import static org.junit.jupiter.api.Assertions.assertNull;
+import static org.junit.jupiter.api.Assertions.assertThrows;
+import static org.junit.jupiter.api.Assertions.assertTrue;
+import static org.junit.jupiter.api.Assumptions.assumeTrue;
+import java.io.IOException;
import java.io.InputStream;
+import java.nio.file.Files;
+import java.nio.file.Paths;
import java.util.Arrays;
import org.junit.jupiter.api.Test;
import org.xml.sax.ContentHandler;
+import org.xml.sax.SAXException;
import org.apache.tika.TikaTest;
+import org.apache.tika.config.TikaConfig;
+import org.apache.tika.exception.TikaException;
import org.apache.tika.metadata.Metadata;
import org.apache.tika.metadata.TikaCoreProperties;
+import org.apache.tika.parser.CompositeParser;
+import org.apache.tika.parser.ParseContext;
+import org.apache.tika.parser.external.ExternalParser;
import org.apache.tika.sax.BodyContentHandler;
+import org.apache.tika.utils.StringUtils;
public class DWGParserTest extends TikaTest {
+    /**
+     * Reports whether the dwgread executable configured on the parser can be run.
+     *
+     * @param parser the parser whose {@code dwgReadExecutable} setting is checked
+     * @return {@code false} if no executable is configured or the path is not a regular
+     *         file; otherwise the result of {@link ExternalParser#check(String[])}
+     */
+    public boolean canRun(DWGParser parser) {
+        String dwgRead = parser.getDwgReadExecutable();
+
+        // nothing configured, or the configured path is not a file: there is nothing to run
+        if (StringUtils.isBlank(dwgRead) || !Files.isRegularFile(Paths.get(dwgRead))) {
+            return false;
+        }
+        // Try running DWGRead from there, and see if it exists + works
+        String[] checkCmd = { dwgRead };
+        return ExternalParser.check(checkCmd);
+
+    }
@Test
public void testDWG2000Parser() throws Exception {
InputStream input =
@@ -80,7 +105,7 @@ public class DWGParserTest extends TikaTest {
.getResourceAsStream("/test-documents/testDWG2010_custom_props.dwg")) {
Metadata metadata = new Metadata();
ContentHandler handler = new BodyContentHandler();
- new DWGParser().parse(input, handler, metadata, null);
+ new DWGParser().parse(input, handler, metadata,new ParseContext());
assertEquals("valueforcustomprop1",
metadata.get(DWGParser.DWG_CUSTOM_META_PREFIX +
"customprop1"));
@@ -101,12 +126,12 @@ public class DWGParserTest extends TikaTest {
}
}
- @SuppressWarnings("deprecation")
+
private void testParser(InputStream input) throws Exception {
try {
Metadata metadata = new Metadata();
ContentHandler handler = new BodyContentHandler();
- new DWGParser().parse(input, handler, metadata);
+ new DWGParser().parse(input, handler, metadata,new ParseContext());
assertEquals("image/vnd.dwg", metadata.get(Metadata.CONTENT_TYPE));
@@ -129,12 +154,12 @@ public class DWGParserTest extends TikaTest {
}
}
- @SuppressWarnings("deprecation")
+
private void testParserNoHeader(InputStream input) throws Exception {
try {
Metadata metadata = new Metadata();
ContentHandler handler = new BodyContentHandler();
- new DWGParser().parse(input, handler, metadata);
+ new DWGParser().parse(input, handler, metadata,new ParseContext());
assertEquals("image/vnd.dwg", metadata.get(Metadata.CONTENT_TYPE));
@@ -152,12 +177,11 @@ public class DWGParserTest extends TikaTest {
}
}
- @SuppressWarnings("deprecation")
private void testParserAlt(InputStream input) throws Exception {
try {
Metadata metadata = new Metadata();
ContentHandler handler = new BodyContentHandler();
- new DWGParser().parse(input, handler, metadata);
+ new DWGParser().parse(input, handler, metadata, new
ParseContext());
assertEquals("image/vnd.dwg", metadata.get(Metadata.CONTENT_TYPE));
@@ -191,4 +215,31 @@ public class DWGParserTest extends TikaTest {
assertEquals("jlakshvi", metadata.get(TikaCoreProperties.MODIFIER));
assertEquals("CUSTOMER'S ADDRESS", metadata.get("dwg-custom:CUSTOMER'S
ADDRESS"));
}
+    /**
+     * Parses a sample drawing through the dwgread-backed configuration and checks that a
+     * known annotation is extracted. Skipped when the configured executable cannot be run.
+     */
+    @Test
+    public void testDWGReadexe() throws Exception {
+
+        final DWGParser parser;
+        // close the config stream once the parser has been built
+        try (InputStream stream = getResourceAsStream("/test-configs/tika-config-dwgRead.xml")) {
+            parser = (DWGParser) ((CompositeParser) new TikaConfig(stream).getParser())
+                    .getAllComponentParsers().get(0);
+        }
+        assumeTrue(canRun(parser), "Can't run DWGRead.exe");
+        String output = getText("architectural_-_annotation_scaling_and_multileaders.dwg", parser);
+        assertContains("ELEV. 11'-9\" TOP OF SECOND FLR.", output);
+    }
+
+    /**
+     * Uses a configuration with a 1 ms dwgread timeout and expects parsing to fail with a
+     * {@link TikaException} that reports the exceeded timeout. Skipped when the configured
+     * executable cannot be run.
+     */
+    @Test
+    public void testDWGReadtimeout() throws TikaException, IOException, SAXException {
+
+        final DWGParser parser;
+        // close the config stream once the parser has been built
+        try (InputStream stream = getResourceAsStream("/test-configs/tika-config-dwgRead-Timeout.xml")) {
+            parser = (DWGParser) ((CompositeParser) new TikaConfig(stream).getParser())
+                    .getAllComponentParsers().get(0);
+        }
+        assumeTrue(canRun(parser), "Can't run DWGRead.exe");
+        TikaException thrown = assertThrows(
+                TikaException.class,
+                () -> getText("architectural_-_annotation_scaling_and_multileaders.dwg", parser),
+                "Expected getText() to throw TikaException but it failed"
+        );
+        assertTrue(thrown.getMessage().contains("Timeout setting exceeded current setting of"));
+    }
+
}
diff --git
a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-cad-module/src/test/java/org/apache/tika/parser/dwg/DWGReadFormatRemoverTest.java
b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-cad-module/src/test/java/org/apache/tika/parser/dwg/DWGReadFormatRemoverTest.java
new file mode 100644
index 000000000..d570a6f6d
--- /dev/null
+++
b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-cad-module/src/test/java/org/apache/tika/parser/dwg/DWGReadFormatRemoverTest.java
@@ -0,0 +1,73 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.tika.parser.dwg;
+
+import static org.junit.jupiter.api.Assertions.assertEquals;
+
+import org.junit.jupiter.api.Test;
+
+
+/**
+ * Checks that DWGReadFormatRemover.cleanupDwgString strips formatting markers from
+ * text values and leaves the readable text behind.
+ */
+public class DWGReadFormatRemoverTest {
+ // Several semicolon-terminated markers, a "\l" marker, a brace group and a stacked
+ // fraction in one value: only the plain text and the fraction as "a/b" should remain.
+ @Test
+ public void testBasic() {
+ String formatted =
"\\A1;\\fAIGDT|b0|i0;\\H2.5000;\\ln\\fArial|b0|i0;\\H2.5000;68{\\H1.3;\\S+0,8^+0,1;}";
+ DWGReadFormatRemover dwgReadFormatter = new DWGReadFormatRemover();
+ String expected = "n68+0,8/+0,1";
+ assertEquals(expected, dwgReadFormatter.cleanupDwgString(formatted));
+ }
+
+ // Semicolon-terminated markers in the middle of a sentence are removed and the
+ // surrounding words are kept.
+ @Test
+ public void testParameterizables() {
+ String formatted = "the quick \\A1;\\fAIGDT|b0|i0;\\H2.5000; brown
fox";
+ DWGReadFormatRemover dwgReadFormatter = new DWGReadFormatRemover();
+ String expected = "the quick brown fox";
+ assertEquals(expected, dwgReadFormatter.cleanupDwgString(formatted));
+ }
+ // An escaped backslash ("\\\\" in the Java literal) must come out as a single
+ // backslash while the markers after it are still removed.
+ @Test
+ public void testEscapedSlashes() {
+ String formatted = "the quick \\\\ \\A3;\\fAIGDT|b0|i0;\\H2.5000;brown
fox";
+ DWGReadFormatRemover dwgReadFormatter = new DWGReadFormatRemover();
+ String expected = "the quick \\ brown fox";
+ assertEquals(expected, dwgReadFormatter.cleanupDwgString(formatted));
+ }
+
+ // Single-letter markers (\L, \l, \K, \k) are dropped, including inside words, and
+ // escaped braces are turned into literal braces.
+ @Test
+ public void testUnderlineEtc() {
+ String formatted = "l \\L open cu\\lrly bra\\Kck\\ket \\{ and a close
" +
+ "\\} right?";
+ DWGReadFormatRemover dwgReadFormatter = new DWGReadFormatRemover();
+ String expected = "l open curly bracket { and a close } right?";
+ assertEquals(expected, dwgReadFormatter.cleanupDwgString(formatted));
+
+ }
+ // The \P marker becomes a real newline; escaped braces become literal braces.
+ @Test
+ public void testEscaped() {
+ String formatted = "then an actual \\P open curly bracket \\{ and a
close \\} right?";
+ DWGReadFormatRemover dwgReadFormatter = new DWGReadFormatRemover();
+ String expected = "then an actual \n open curly bracket { and a close
} right?";
+ assertEquals(expected, dwgReadFormatter.cleanupDwgString(formatted));
+ }
+
+ // A stacked fraction "\S<top>^<bottom>;" is rewritten as "<top>/<bottom>".
+ @Test
+ public void testStackedFractions() {
+ String formatted = "abc \\S+0,8^+0,1; efg";
+ DWGReadFormatRemover dwgReadFormatter = new DWGReadFormatRemover();
+ String expected = "abc +0,8/+0,1 efg";
+ assertEquals(expected, dwgReadFormatter.cleanupDwgString(formatted));
+ }
+
+}
diff --git
a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-cad-module/src/test/resources/test-configs/tika-config-dwgRead-Timeout.xml
b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-cad-module/src/test/resources/test-configs/tika-config-dwgRead-Timeout.xml
new file mode 100644
index 000000000..2ae7757bf
--- /dev/null
+++
b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-cad-module/src/test/resources/test-configs/tika-config-dwgRead-Timeout.xml
@@ -0,0 +1,27 @@
+<?xml version="1.0" encoding="UTF-8"?>
+<!--
+ Licensed to the Apache Software Foundation (ASF) under one or more
+ contributor license agreements. See the NOTICE file distributed with
+ this work for additional information regarding copyright ownership.
+ The ASF licenses this file to You under the Apache License, Version 2.0
+ (the "License"); you may not use this file except in compliance with
+ the License. You may obtain a copy of the License at
+
+ http://www.apache.org/licenses/LICENSE-2.0
+
+ Unless required by applicable law or agreed to in writing, software
+ distributed under the License is distributed on an "AS IS" BASIS,
+ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ See the License for the specific language governing permissions and
+ limitations under the License.
+-->
+<properties>
+ <parsers>
+ <parser class="org.apache.tika.parser.dwg.DWGParser">
+ <params>
+ <param name="dwgReadExecutable"
type="string">g:/libredwg-0.12.5-win64/dwgread.exe</param>
+ <param name="dwgReadTimeout" type="long">1</param>
+ </params>
+ </parser>
+ </parsers>
+</properties>
diff --git
a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-cad-module/src/test/resources/test-configs/tika-config-dwgRead.xml
b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-cad-module/src/test/resources/test-configs/tika-config-dwgRead.xml
new file mode 100644
index 000000000..bddfa781c
--- /dev/null
+++
b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-cad-module/src/test/resources/test-configs/tika-config-dwgRead.xml
@@ -0,0 +1,26 @@
+<?xml version="1.0" encoding="UTF-8"?>
+<!--
+ Licensed to the Apache Software Foundation (ASF) under one or more
+ contributor license agreements. See the NOTICE file distributed with
+ this work for additional information regarding copyright ownership.
+ The ASF licenses this file to You under the Apache License, Version 2.0
+ (the "License"); you may not use this file except in compliance with
+ the License. You may obtain a copy of the License at
+
+ http://www.apache.org/licenses/LICENSE-2.0
+
+ Unless required by applicable law or agreed to in writing, software
+ distributed under the License is distributed on an "AS IS" BASIS,
+ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ See the License for the specific language governing permissions and
+ limitations under the License.
+-->
+<properties>
+ <parsers>
+ <parser class="org.apache.tika.parser.dwg.DWGParser">
+ <params>
+ <param name="dwgReadExecutable"
type="string">g:/libredwg-0.12.5-win64/dwgread.exe</param>
+ </params>
+ </parser>
+ </parsers>
+</properties>
diff --git
a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-cad-module/src/test/resources/test-documents/architectural_-_annotation_scaling_and_multileaders.dwg
b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-cad-module/src/test/resources/test-documents/architectural_-_annotation_scaling_and_multileaders.dwg
new file mode 100644
index 000000000..0ad0df75a
Binary files /dev/null and
b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-cad-module/src/test/resources/test-documents/architectural_-_annotation_scaling_and_multileaders.dwg
differ