This is an automated email from the ASF dual-hosted git repository.

tallison pushed a commit to branch main
in repository https://gitbox.apache.org/repos/asf/tika.git


The following commit(s) were added to refs/heads/main by this push:
     new 75d88149b TIKA-1735 - Adding DWGRead parser to Tika if available (#558)
75d88149b is described below

commit 75d88149bc479d79b02c986e86333be11f697692
Author: Dan Coldrick <[email protected]>
AuthorDate: Thu Oct 6 16:38:50 2022 +0100

    TIKA-1735 - Adding DWGRead parser to Tika if available (#558)
    
    * Initial commit for review.
    
    * Resolving tmpFileOutCleaned to make sure it is deleted even if we
    encounter an exception
    
    * Added ProcessUtils.execute instead of calling process directly and fixed
    checkstyle
    
    * Added Summary Info from Nicholas DiPiazza
    
    * Added tests back in, added initialize for dwgread to check exists
    
    * Attempt at cleaning up the DWG strings with regex
    
    * Fixed some of the broken regexes
    
    * Cleaned up code for checkstyle
    
    * Fixed Nan as was replacing with "" instead of 0
    
    * Added buffer reader and added new regexes
    
    * Amended Regexes, fixed config default, fixed Julian Date
    
    * Update DWGParser.java
    
    * Fixed DWGParser
    
    * Fixed DWGParser Test
    
    * Fixed Configs
    
    * Added new classs for cleaning up format
    
    * Check Style fixes
    
    * Added Tests and fixed DWGReadFormatRemover
    
    * Added Tests and fixed DWGReadFormatRemover
    
    Check Style fixes
    
    * Fixed CheckStyle Issues
    
    * Added Timeout test
    
    * Added Timeout test
    
    Co-authored-by: monkm <[email protected]>
---
 .../tika-parser-cad-module/pom.xml                 |  11 +
 .../apache/tika/parser/dwg/AbstractDWGParser.java  |  93 ++++++
 .../java/org/apache/tika/parser/dwg/DWGParser.java | 151 +++++-----
 .../apache/tika/parser/dwg/DWGParserConfig.java    | 128 ++++++++
 .../tika/parser/dwg/DWGReadFormatRemover.java      | 106 +++++++
 .../org/apache/tika/parser/dwg/DWGReadParser.java  | 325 +++++++++++++++++++++
 .../org/apache/tika/parser/dwg/JulianDateUtil.java |  47 +++
 .../org/apache/tika/parser/dwg/DWGParserTest.java  |  67 ++++-
 .../tika/parser/dwg/DWGReadFormatRemoverTest.java  |  73 +++++
 .../test-configs/tika-config-dwgRead-Timeout.xml   |  27 ++
 .../resources/test-configs/tika-config-dwgRead.xml |  26 ++
 ...tural_-_annotation_scaling_and_multileaders.dwg | Bin 0 -> 188992 bytes
 12 files changed, 971 insertions(+), 83 deletions(-)

diff --git 
a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-cad-module/pom.xml
 
b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-cad-module/pom.xml
index 9c7ee4937..c08ebeeee 100644
--- 
a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-cad-module/pom.xml
+++ 
b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-cad-module/pom.xml
@@ -36,6 +36,17 @@
       <artifactId>tika-parser-microsoft-module</artifactId>
       <version>${project.version}</version>
     </dependency>
+
+    <dependency>
+       <groupId>com.fasterxml.jackson.core</groupId>
+       <artifactId>jackson-core</artifactId>
+           <version>${jackson.version}</version><!--$NO-MVN-MAN-VER$-->
+    </dependency>
+        <dependency>
+       <groupId>com.fasterxml.jackson.core</groupId>
+       <artifactId>jackson-databind</artifactId>
+           <version>${jackson.version}</version><!--$NO-MVN-MAN-VER$-->
+    </dependency>
   </dependencies>
   <build>
     <plugins>
diff --git 
a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-cad-module/src/main/java/org/apache/tika/parser/dwg/AbstractDWGParser.java
 
b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-cad-module/src/main/java/org/apache/tika/parser/dwg/AbstractDWGParser.java
new file mode 100644
index 000000000..934ec5cba
--- /dev/null
+++ 
b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-cad-module/src/main/java/org/apache/tika/parser/dwg/AbstractDWGParser.java
@@ -0,0 +1,93 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.tika.parser.dwg;
+
+
+import org.apache.tika.config.Field;
+import org.apache.tika.parser.AbstractParser;
+import org.apache.tika.parser.ParseContext;
+
+
+
+
+public abstract class AbstractDWGParser extends AbstractParser {
+
+
+    /**
+     * 
+     */
+    private static final long serialVersionUID = 6261810259683381984L;
+    private final DWGParserConfig defaultDwgParserConfig = new 
DWGParserConfig();
+
+    public void configure(ParseContext parseContext) {
+        DWGParserConfig dwgParserConfig =  
parseContext.get(DWGParserConfig.class, defaultDwgParserConfig);
+        parseContext.set(DWGParserConfig.class, dwgParserConfig);
+    }
+
+
+    String getDwgReadExecutable() {
+        return defaultDwgParserConfig.getDwgReadExecutable();
+    }
+    
+    @Field
+    public void setDwgReadExecutable(String dwgReadExecutable) {
+        defaultDwgParserConfig.setDwgReadExecutable(dwgReadExecutable);
+    }
+    
+    boolean isCleanDwgReadOutput() {
+        return defaultDwgParserConfig.isCleanDwgReadOutput();
+    }
+    
+    @Field
+    public void setCleanDwgReadOutput(boolean cleanDwgReadOutput) {
+        defaultDwgParserConfig.setCleanDwgReadOutput(cleanDwgReadOutput);
+    }
+    
+    int getCleanDwgReadOutputBatchSize() {
+        return defaultDwgParserConfig.getCleanDwgReadOutputBatchSize();
+    }
+    
+    @Field
+    public void setCleanDwgReadOutputBatchSize(int 
cleanDwgReadOutputBatchSize) {
+        
defaultDwgParserConfig.setCleanDwgReadOutputBatchSize(cleanDwgReadOutputBatchSize);
+    }
+    String getCleanDwgReadRegexToReplace() {
+        return defaultDwgParserConfig.getCleanDwgReadRegexToReplace();
+    }
+    
+    @Field
+    public void setCleanDwgReadRegexToReplace(String 
cleanDwgReadRegexToReplace) {
+        
defaultDwgParserConfig.setCleanDwgReadRegexToReplace(cleanDwgReadRegexToReplace);
+    }
+    String getCleanDwgReadReplaceWith() {
+        return defaultDwgParserConfig.getCleanDwgReadReplaceWith();
+    }
+    
+    @Field
+    public void setCleanDwgReadReplaceWith(String cleanDwgReadReplaceWith) {
+        
defaultDwgParserConfig.setCleanDwgReadReplaceWith(cleanDwgReadReplaceWith);
+    }
+    long getDwgReadTimeout() {
+        return defaultDwgParserConfig.getDwgReadTimeout();
+    }
+
+    @Field
+    public void setDwgReadTimeout(long dwgReadTimeout) {
+        defaultDwgParserConfig.setDwgReadtimeout(dwgReadTimeout);
+    }
+    
+}
diff --git 
a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-cad-module/src/main/java/org/apache/tika/parser/dwg/DWGParser.java
 
b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-cad-module/src/main/java/org/apache/tika/parser/dwg/DWGParser.java
index 4519623fc..87b945e25 100644
--- 
a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-cad-module/src/main/java/org/apache/tika/parser/dwg/DWGParser.java
+++ 
b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-cad-module/src/main/java/org/apache/tika/parser/dwg/DWGParser.java
@@ -32,7 +32,6 @@ import org.apache.tika.metadata.Metadata;
 import org.apache.tika.metadata.Property;
 import org.apache.tika.metadata.TikaCoreProperties;
 import org.apache.tika.mime.MediaType;
-import org.apache.tika.parser.AbstractParser;
 import org.apache.tika.parser.ParseContext;
 import org.apache.tika.sax.XHTMLContentHandler;
 
@@ -42,7 +41,7 @@ import org.apache.tika.sax.XHTMLContentHandler;
  * Note that we use Apache POI for various parts of the processing, as
  * lots of the low level string/int/short concepts are the same.
  */
-public class DWGParser extends AbstractParser {
+public class DWGParser extends AbstractDWGParser {
     public static String DWG_CUSTOM_META_PREFIX = "dwg-custom:";
     /**
      * Serial version UID
@@ -51,84 +50,89 @@ public class DWGParser extends AbstractParser {
     /**
      * The order of the fields in the header
      */
-    private static final Property[] HEADER_PROPERTIES_ENTRIES =
-            {TikaCoreProperties.TITLE, TikaCoreProperties.DESCRIPTION, 
TikaCoreProperties.CREATOR,
-                    TikaCoreProperties.SUBJECT, TikaCoreProperties.COMMENTS,
-                    TikaCoreProperties.MODIFIER, null, // Unknown?
-                    TikaCoreProperties.RELATION, // Hyperlink
-            };
+    private static final Property[] HEADER_PROPERTIES_ENTRIES = { 
TikaCoreProperties.TITLE,
+            TikaCoreProperties.DESCRIPTION, TikaCoreProperties.CREATOR, 
TikaCoreProperties.SUBJECT,
+            TikaCoreProperties.COMMENTS, TikaCoreProperties.MODIFIER, null, // 
Unknown?
+            TikaCoreProperties.RELATION, // Hyperlink
+    };
     /**
      * For the 2000 file, they're indexed
      */
-    private static final Property[] HEADER_2000_PROPERTIES_ENTRIES =
-            {null, TikaCoreProperties.RELATION, // 0x01
-                    TikaCoreProperties.TITLE,    // 0x02
-                    TikaCoreProperties.DESCRIPTION,  // 0x03
-                    TikaCoreProperties.CREATOR,   // 0x04
-                    null, TikaCoreProperties.COMMENTS,// 0x06
-                    TikaCoreProperties.SUBJECT,    // 0x07
-                    TikaCoreProperties.MODIFIER, // 0x08
-            };
+    private static final Property[] HEADER_2000_PROPERTIES_ENTRIES = { null, 
TikaCoreProperties.RELATION, // 0x01
+            TikaCoreProperties.TITLE, // 0x02
+            TikaCoreProperties.DESCRIPTION, // 0x03
+            TikaCoreProperties.CREATOR, // 0x04
+            null, TikaCoreProperties.COMMENTS, // 0x06
+            TikaCoreProperties.SUBJECT, // 0x07
+            TikaCoreProperties.MODIFIER, // 0x08
+    };
     private static final String HEADER_2000_PROPERTIES_MARKER_STR = "DWGPROPS 
COOKIE";
-    private static final byte[] HEADER_2000_PROPERTIES_MARKER =
-            new byte[HEADER_2000_PROPERTIES_MARKER_STR.length()];
+    private static final byte[] HEADER_2000_PROPERTIES_MARKER = new 
byte[HEADER_2000_PROPERTIES_MARKER_STR.length()];
     /**
-     * How far to skip after the last standard property, before
-     * we find any custom properties that might be there.
+     * How far to skip after the last standard property, before we find any 
custom
+     * properties that might be there.
      */
     private static final int CUSTOM_PROPERTIES_SKIP = 20;
     /**
      * The value of padding bytes other than 0 in some DWG files.
      */
-    private static final int[] CUSTOM_PROPERTIES_ALT_PADDING_VALUES = new 
int[]{0x2, 0, 0, 0};
+    private static final int[] CUSTOM_PROPERTIES_ALT_PADDING_VALUES = new 
int[] { 0x2, 0, 0, 0 };
     private static MediaType TYPE = MediaType.image("vnd.dwg");
 
     static {
-        StringUtil.putCompressedUnicode(HEADER_2000_PROPERTIES_MARKER_STR,
-                HEADER_2000_PROPERTIES_MARKER, 0);
+        StringUtil.putCompressedUnicode(HEADER_2000_PROPERTIES_MARKER_STR, 
HEADER_2000_PROPERTIES_MARKER, 0);
     }
 
     public Set<MediaType> getSupportedTypes(ParseContext context) {
         return Collections.singleton(TYPE);
     }
 
-    public void parse(InputStream stream, ContentHandler handler, Metadata 
metadata,
-                      ParseContext context) throws IOException, TikaException, 
SAXException {
-        // First up, which version of the format are we handling?
-        byte[] header = new byte[128];
-        IOUtils.readFully(stream, header);
-        String version = new String(header, 0, 6, "US-ASCII");
-
-        XHTMLContentHandler xhtml = new XHTMLContentHandler(handler, metadata);
-        xhtml.startDocument();
-
-        switch (version) {
-            case "AC1015":
-                metadata.set(Metadata.CONTENT_TYPE, TYPE.toString());
-                if (skipTo2000PropertyInfoSection(stream, header)) {
-                    get2000Props(stream, metadata, xhtml);
-                }
-                break;
-            case "AC1018":
-                metadata.set(Metadata.CONTENT_TYPE, TYPE.toString());
-                if (skipToPropertyInfoSection(stream, header)) {
-                    get2004Props(stream, metadata, xhtml);
-                }
-                break;
-            case "AC1027":
-            case "AC1032":
-            case "AC1021":
-            case "AC1024":
-                metadata.set(Metadata.CONTENT_TYPE, TYPE.toString());
-                if (skipToPropertyInfoSection(stream, header)) {
-                    get2007and2010Props(stream, metadata, xhtml);
-                }
-                break;
-            default:
-                throw new TikaException("Unsupported AutoCAD drawing version: 
" + version);
-        }
+    public void parse(InputStream stream, ContentHandler handler, Metadata 
metadata, ParseContext context)
+            throws IOException, TikaException, SAXException {
 
-        xhtml.endDocument();
+        configure(context);
+        DWGParserConfig dwgc = context.get(DWGParserConfig.class);
+
+        if (!dwgc.getDwgReadExecutable().isEmpty()) {
+            DWGReadParser dwr = new DWGReadParser();
+            dwr.parse(stream, handler, metadata, context);
+        } else {
+            // First up, which version of the format are we handling?
+            byte[] header = new byte[128];
+            IOUtils.readFully(stream, header);
+            String version = new String(header, 0, 6, "US-ASCII");
+
+            XHTMLContentHandler xhtml = new XHTMLContentHandler(handler, 
metadata);
+            xhtml.startDocument();
+
+            switch (version) {
+                case "AC1015":
+                    metadata.set(Metadata.CONTENT_TYPE, TYPE.toString());
+                    if (skipTo2000PropertyInfoSection(stream, header)) {
+                        get2000Props(stream, metadata, xhtml);
+                    }
+                    break;
+                case "AC1018":
+                    metadata.set(Metadata.CONTENT_TYPE, TYPE.toString());
+                    if (skipToPropertyInfoSection(stream, header)) {
+                        get2004Props(stream, metadata, xhtml);
+                    }
+                    break;
+                case "AC1027":
+                case "AC1032":
+                case "AC1021":
+                case "AC1024":
+                    metadata.set(Metadata.CONTENT_TYPE, TYPE.toString());
+                    if (skipToPropertyInfoSection(stream, header)) {
+                        get2007and2010Props(stream, metadata, xhtml);
+                    }
+                    break;
+                default:
+                    throw new TikaException("Unsupported AutoCAD drawing 
version: " + version);
+            }
+
+            xhtml.endDocument();
+        }
     }
 
     /**
@@ -169,8 +173,7 @@ public class DWGParser extends AbstractParser {
     /**
      * Stored as UCS2, so 16 bit "unicode"
      */
-    private void get2007and2010Props(InputStream stream, Metadata metadata,
-                                     XHTMLContentHandler xhtml)
+    private void get2007and2010Props(InputStream stream, Metadata metadata, 
XHTMLContentHandler xhtml)
             throws IOException, TikaException, SAXException {
         // Standard properties
         for (int i = 0; i < HEADER_PROPERTIES_ENTRIES.length; i++) {
@@ -246,8 +249,8 @@ public class DWGParser extends AbstractParser {
         }
     }
 
-    private void handleHeader(int headerNumber, String value, Metadata 
metadata,
-                              XHTMLContentHandler xhtml) throws SAXException {
+    private void handleHeader(int headerNumber, String value, Metadata 
metadata, XHTMLContentHandler xhtml)
+            throws SAXException {
         if (value == null || value.length() == 0) {
             return;
         }
@@ -263,14 +266,13 @@ public class DWGParser extends AbstractParser {
     /**
      * Grab the offset, then skip there
      */
-    private boolean skipToPropertyInfoSection(InputStream stream, byte[] 
header)
-            throws IOException, TikaException {
+    private boolean skipToPropertyInfoSection(InputStream stream, byte[] 
header) throws IOException, TikaException {
         // The offset is stored in the header from 0x20 onwards
         long offsetToSection = EndianUtils.getLongLE(header, 0x20);
 
         // Bounds check the offset. Some files seem to use a different format,
-        //  and the offset isn't available at 0x20. Until we can work out how
-        //  to find the offset in those files, skip them if detected
+        // and the offset isn't available at 0x20. Until we can work out how
+        // to find the offset in those files, skip them if detected
         if (offsetToSection > 0xa00000l) {
             // Header should never be more than 10mb into the file, something 
is wrong
             offsetToSection = 0;
@@ -289,8 +291,7 @@ public class DWGParser extends AbstractParser {
     /**
      * We think it can be anywhere...
      */
-    private boolean skipTo2000PropertyInfoSection(InputStream stream, byte[] 
header)
-            throws IOException {
+    private boolean skipTo2000PropertyInfoSection(InputStream stream, byte[] 
header) throws IOException {
         int val = 0;
         while (val != -1) {
             val = stream.read();
@@ -315,11 +316,11 @@ public class DWGParser extends AbstractParser {
         // There should be 4 zero bytes or 
CUSTOM_PROPERTIES_ALT_PADDING_VALUES next
         byte[] padding = new byte[4];
         IOUtils.readFully(stream, padding);
-        if ((padding[0] == 0 && padding[1] == 0 && padding[2] == 0 && 
padding[3] == 0) ||
-                (padding[0] == CUSTOM_PROPERTIES_ALT_PADDING_VALUES[0] &&
-                        padding[1] == CUSTOM_PROPERTIES_ALT_PADDING_VALUES[1] 
&&
-                        padding[2] == CUSTOM_PROPERTIES_ALT_PADDING_VALUES[2] 
&&
-                        padding[3] == 
CUSTOM_PROPERTIES_ALT_PADDING_VALUES[3])) {
+        if ((padding[0] == 0 && padding[1] == 0 && padding[2] == 0 && 
padding[3] == 0)
+                || (padding[0] == CUSTOM_PROPERTIES_ALT_PADDING_VALUES[0]
+                        && padding[1] == 
CUSTOM_PROPERTIES_ALT_PADDING_VALUES[1]
+                        && padding[2] == 
CUSTOM_PROPERTIES_ALT_PADDING_VALUES[2]
+                        && padding[3] == 
CUSTOM_PROPERTIES_ALT_PADDING_VALUES[3])) {
 
             // Looks hopeful, skip on
             padding = new byte[CUSTOM_PROPERTIES_SKIP];
diff --git 
a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-cad-module/src/main/java/org/apache/tika/parser/dwg/DWGParserConfig.java
 
b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-cad-module/src/main/java/org/apache/tika/parser/dwg/DWGParserConfig.java
new file mode 100644
index 000000000..35300080b
--- /dev/null
+++ 
b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-cad-module/src/main/java/org/apache/tika/parser/dwg/DWGParserConfig.java
@@ -0,0 +1,128 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.tika.parser.dwg;
+
+import java.io.File;
+import java.io.IOException;
+import java.io.Serializable;
+import java.nio.file.Files;
+import java.nio.file.Paths;
+import java.util.Arrays;
+import java.util.Map;
+
+import org.slf4j.Logger;
+import org.slf4j.LoggerFactory;
+
+import org.apache.tika.config.Param;
+import org.apache.tika.exception.TikaConfigException;
+import org.apache.tika.parser.external.ExternalParser;
+import org.apache.tika.utils.StringUtils;
+
+public class DWGParserConfig implements Serializable {
+
+    private static final long serialVersionUID = -7623524257255755725L;
+    private String dwgReadExecutable = "";
+    private boolean cleanDwgReadOutput = true;
+    private int cleanDwgReadOutputBatchSize = 10000000;
+    // default to 5 minutes, some large DWG's do take a while...
+    private long dwgReadTimeout = 300000;
+    // we need to remove non UTF chars and Nan's (dwgread outputs these as nan)
+    private String cleanDwgReadRegexToReplace = "[^\\x20-\\x7e]";
+    private String cleanDwgReadReplaceWith = "";
+    @SuppressWarnings("unused") 
+    private boolean hasDwgRead;
+    private static final Logger LOG = 
LoggerFactory.getLogger(DWGParserConfig.class);
+
+    public void initialize(Map<String, Param> params) throws 
TikaConfigException {
+        hasDwgRead = hasDwgRead();
+
+    }
+
+    public boolean hasDwgRead() throws TikaConfigException {
+        // Fetch where the config says to find DWGRead
+        String dwgRead = getDwgReadExecutable();
+
+        if (!StringUtils.isBlank(dwgRead) && 
!Files.isRegularFile(Paths.get(dwgRead))) {
+            throw new TikaConfigException("DwgRead cannot be found at: " + 
dwgRead);
+        }
+
+        // Try running DWGRead from there, and see if it exists + works
+        String[] checkCmd = { dwgRead };
+        boolean hasDwgRead = ExternalParser.check(checkCmd);
+        LOG.debug("hasDwgRead (path: " + Arrays.toString(checkCmd) + "): " + 
hasDwgRead);
+        return hasDwgRead;
+    }
+
+    public String getDwgReadExecutable() {
+
+        return dwgReadExecutable;
+    }
+
+    public boolean isCleanDwgReadOutput() {
+        return cleanDwgReadOutput;
+    }
+
+    public int getCleanDwgReadOutputBatchSize() {
+        return cleanDwgReadOutputBatchSize;
+    }
+
+    public long getDwgReadTimeout() {
+        return dwgReadTimeout;
+    }
+
+    public String getCleanDwgReadRegexToReplace() {
+        return cleanDwgReadRegexToReplace;
+    }
+
+    public String getCleanDwgReadReplaceWith() {
+        return cleanDwgReadReplaceWith;
+    }
+
+    public void setDwgReadExecutable(String dwgReadExecutable) {
+        if (!Paths.get(dwgReadExecutable).isAbsolute())
+            try {
+                dwgReadExecutable =   new 
File(dwgReadExecutable).getCanonicalFile().toString();
+            } catch (IOException e) {
+                //do nothing as the error will be picked up by the DWG Parser
+            }
+
+
+        this.dwgReadExecutable = dwgReadExecutable;
+    }
+
+    public void setCleanDwgReadOutput(boolean cleanDwgReadOutput) {
+        this.cleanDwgReadOutput = cleanDwgReadOutput;
+    }
+
+    public void setCleanDwgReadOutputBatchSize(int 
cleanDwgReadOutputBatchSize) {
+        this.cleanDwgReadOutputBatchSize = cleanDwgReadOutputBatchSize;
+    }
+
+    public void setDwgReadtimeout(long dwgReadtimeout) {
+        this.dwgReadTimeout = dwgReadtimeout;
+    }
+
+    public void setCleanDwgReadRegexToReplace(String 
cleanDwgReadRegexToReplace) {
+        this.cleanDwgReadRegexToReplace = cleanDwgReadRegexToReplace;
+    }
+
+    public void setCleanDwgReadReplaceWith(String cleanDwgReadReplaceWith) {
+        this.cleanDwgReadReplaceWith = cleanDwgReadReplaceWith;
+    }
+
+}
diff --git 
a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-cad-module/src/main/java/org/apache/tika/parser/dwg/DWGReadFormatRemover.java
 
b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-cad-module/src/main/java/org/apache/tika/parser/dwg/DWGReadFormatRemover.java
new file mode 100644
index 000000000..9a5ab4bd2
--- /dev/null
+++ 
b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-cad-module/src/main/java/org/apache/tika/parser/dwg/DWGReadFormatRemover.java
@@ -0,0 +1,106 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.tika.parser.dwg;
+
+import java.util.regex.Matcher;
+import java.util.regex.Pattern;
+
+/**
+ * DWGReadFormatRemover removes the formatting from the text from libredwg 
files so only
+ * the raw text remains.
+ * What needs to be cleaned has been found on the following websites:
+ * <p>
+ * <a 
href="https://www.cadforum.cz/en/text-formatting-codes-in-mtext-objects-tip8640";>
+ * 
https://www.cadforum.cz/en/text-formatting-codes-in-mtext-objects-tip8640</a>
+ * <p>
+ * <a 
href="https://adndevblog.typepad.com/autocad/2017/09/dissecting-mtext-format-codes.html";>
+ * 
https://adndevblog.typepad.com/autocad/2017/09/dissecting-mtext-format-codes.html</a>
+ * <p>
+ */
+
+public class DWGReadFormatRemover {
+    private static final String underlineStrikeThrough = 
"((?:\\\\\\\\)+|\\\\[LlOoKk])";
+    private static final String endMarks = 
"((?:\\\\\\\\)+|\\\\(?:A|H|pi|pxt|pxi|pt|X|Q|f|F|W|C|T)[^;]{0,100};)";
+    private static final String newLine = "((?:\\\\\\\\)+|\\\\P)";
+    private static final  String stackFrac = 
"(\\\\\\\\)+|\\\\S([^/^#]{1,20})[/^#]([^;]{1,20});";
+    private static final String curlyBraces = "(\\\\)+[{}]|([{}])";
+    private static final String escapeChars = "(?<!\\\\)(\\\\)(?!\\\\)";
+    public String cleanupDwgString(String dwgString) {
+        String cleanString = dwgString;
+        StringBuffer sb = new StringBuffer();
+        //Strip off start/stop underline/overstrike/strike throughs
+        Matcher m = 
Pattern.compile(underlineStrikeThrough).matcher(cleanString);
+        while (m.find()) {
+            if (! m.group(1).endsWith("\\")) {
+                m.appendReplacement(sb, "");
+            }
+        }
+        m.appendTail(sb);
+        cleanString = sb.toString();
+
+        //Strip off semi-colon ended markers
+        m = Pattern.compile(endMarks).matcher(cleanString);
+        sb.setLength(0);
+        while (m.find()) {
+            if (! m.group(1).endsWith("\\")) {
+                m.appendReplacement(sb, "");
+            }
+        }
+        m.appendTail(sb);
+        cleanString = sb.toString();
+
+            //new line marker \\P replace with actual new line
+        m = Pattern.compile(newLine).matcher(cleanString);
+        sb.setLength(0);
+        while (m.find()) {
+            if (m.group(1).endsWith("P")) {
+                m.appendReplacement(sb, "\n");
+            }
+        }
+        m.appendTail(sb);
+        cleanString = sb.toString();
+
+            //stacking fractions
+        m = Pattern.compile(stackFrac).matcher(cleanString);
+        sb.setLength(0);
+        while (m.find()) {
+            if (m.group(1) == null) {
+                m.appendReplacement(sb, m.group(2) + "/" + m.group(3));
+            }
+        }
+        m.appendTail(sb);
+        cleanString = sb.toString();
+
+        //strip brackets around text, make sure they aren't escaped
+        m = Pattern.compile(curlyBraces).matcher(cleanString);
+        sb.setLength(0);
+        while (m.find()) {
+            if (m.group(1) == null) {
+                m.appendReplacement(sb, "");
+            }
+        }
+        m.appendTail(sb);
+        cleanString = sb.toString();
+            //now get rid of escape characters
+        cleanString = cleanString.replaceAll(escapeChars, "");
+        //now unescape backslash
+        cleanString = cleanString.replaceAll("(\\\\\\\\)", "\\\\");
+        return cleanString;
+    }
+
+}
diff --git 
a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-cad-module/src/main/java/org/apache/tika/parser/dwg/DWGReadParser.java
 
b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-cad-module/src/main/java/org/apache/tika/parser/dwg/DWGReadParser.java
new file mode 100644
index 000000000..fe9a1b663
--- /dev/null
+++ 
b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-cad-module/src/main/java/org/apache/tika/parser/dwg/DWGReadParser.java
@@ -0,0 +1,325 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.tika.parser.dwg;
+
+import java.io.BufferedReader;
+import java.io.BufferedWriter;
+import java.io.File;
+import java.io.FileOutputStream;
+import java.io.IOException;
+import java.io.InputStream;
+import java.io.InputStreamReader;
+import java.io.OutputStreamWriter;
+import java.nio.charset.StandardCharsets;
+import java.nio.file.Files;
+import java.time.Instant;
+import java.util.Arrays;
+import java.util.Collections;
+import java.util.List;
+import java.util.Set;
+import java.util.UUID;
+import java.util.function.Consumer;
+
+import com.fasterxml.jackson.core.JsonFactory;
+import com.fasterxml.jackson.core.JsonParseException;
+import com.fasterxml.jackson.core.JsonParser;
+import com.fasterxml.jackson.core.JsonToken;
+import com.fasterxml.jackson.core.json.JsonReadFeature;
+import org.apache.commons.io.FileUtils;
+import org.apache.commons.lang3.StringUtils;
+import org.slf4j.Logger;
+import org.slf4j.LoggerFactory;
+import org.xml.sax.ContentHandler;
+import org.xml.sax.SAXException;
+
+import org.apache.tika.exception.TikaException;
+import org.apache.tika.metadata.Metadata;
+import org.apache.tika.metadata.TikaCoreProperties;
+import org.apache.tika.mime.MediaType;
+import org.apache.tika.parser.ParseContext;
+import org.apache.tika.sax.XHTMLContentHandler;
+import org.apache.tika.utils.ExceptionUtils;
+import org.apache.tika.utils.FileProcessResult;
+import org.apache.tika.utils.ProcessUtils;
+
+
+
+
+
+/**
+ * DWGReadParser (CAD Drawing) parser. This extends the original DWGParser if 
in 
+ * the parser configuration DwgRead is set. DWG reader can be found here: 
+ * <p>
+ * <a 
href="https://github.com/LibreDWG/libredwg";>https://github.com/LibreDWG/libredwg</a>
+ * <p>
+ * DWGRead outputs json which we then loop through extracting the text 
elements 
+ * The required configuration is dwgReadExecutable. The other settings which 
can be
+ * overwritten are: 
+ * <p>
+ * boolean : cleanDwgReadOutput - whether to clean the json output 
+ * <p>
+ * int : cleanDwgReadOutputBatchSize - clean output batch size to process 
+ * <p>
+ * long : dwgReadTimeout -timeout in milliseconds before killing the dwgread 
process
+ * <p>
+ * String : cleanDwgReadRegexToReplace - characters to replace in the json 
+ * <p>
+ * String : cleanDwgReadReplaceWith - * replacement characters 
dwgReadExecutable
+ */
+
+public class DWGReadParser extends AbstractDWGParser {
+    private static final Logger LOG = 
LoggerFactory.getLogger(DWGReadParser.class);
+    /**
+     * 
+     */
+    private static final long serialVersionUID = 7983127145030096837L;
+    private static MediaType TYPE = MediaType.image("vnd.dwg");
+
+    public Set<MediaType> getSupportedTypes(ParseContext context) {
+        return Collections.singleton(TYPE);
+    }
+
+    @Override
+    public void parse(InputStream stream, ContentHandler handler, Metadata 
metadata, ParseContext context)
+            throws IOException, SAXException, TikaException {
+
+        configure(context);
+        DWGParserConfig dwgc = context.get(DWGParserConfig.class);
+        final XHTMLContentHandler xhtml = new XHTMLContentHandler(handler, 
metadata);
+        xhtml.startDocument();
+        // create unique files so we avoid overwriting out files if 
multithreaded
+        UUID uuid = UUID.randomUUID();
+        File tmpFileOut = File.createTempFile(uuid + "dwgreadout", ".json");
+        File tmpFileOutCleaned = File.createTempFile(uuid + "dwgreadoutclean", 
".json");
+        File tmpFileIn = File.createTempFile(uuid + "dwgreadin", ".dwg");
+        try {
+            
+
+            FileUtils.copyInputStreamToFile(stream, tmpFileIn);
+
+            List<String> command = Arrays.asList(dwgc.getDwgReadExecutable(), 
"-O", "JSON", "-o",
+                    tmpFileOut.getCanonicalPath(), 
tmpFileIn.getCanonicalPath());
+            ProcessBuilder pb = new ProcessBuilder().command(command);
+            LOG.info("About to call DWGRead: " + command.toString());
+            FileProcessResult fpr = ProcessUtils.execute(pb, 
dwgc.getDwgReadTimeout(), 10000, 10000);
+            LOG.info("DWGRead Exit code is: " + fpr.getExitValue());
+            if (fpr.getExitValue() == 0) {
+                if (dwgc.isCleanDwgReadOutput()) {
+                    // dwgread sometimes creates strings with invalid utf-8 
sequences or invalid
+                    // json (nan instead of NaN). replace them
+                    // with empty string.
+                    LOG.debug("Cleaning Json Output - Replace: " + 
dwgc.getCleanDwgReadRegexToReplace() 
+                              + " with: " + dwgc.getCleanDwgReadReplaceWith());
+                    try ( BufferedReader br = new BufferedReader(
+                              new InputStreamReader(
+                                      
Files.newInputStream(tmpFileOut.toPath()),
+                              StandardCharsets.UTF_8));
+                            
+                            BufferedWriter out = new BufferedWriter(
+                                    new OutputStreamWriter(
+                                            new 
FileOutputStream(tmpFileOutCleaned, true), 
+                                            StandardCharsets.UTF_8),32768))
+                    {
+
+                        String sCurrentLine;
+                        while ((sCurrentLine = br.readLine()) != null) 
+                        {
+                            sCurrentLine = sCurrentLine
+                                            .replaceAll( 
dwgc.getCleanDwgReadRegexToReplace(), 
+                                                    
dwgc.getCleanDwgReadReplaceWith())
+                                            .replaceAll(" nan,", " 0,")
+                                            .replaceAll(" nan ", " 0 ")
+                                            .replaceAll("\\.,", " \\. ,") + 
"\n";
+                            out.write(sCurrentLine);
+                        }                            
+                                 
+                    } finally {
+                        FileUtils.deleteQuietly(tmpFileIn);
+                        FileUtils.deleteQuietly(tmpFileOut);
+                        tmpFileOut = tmpFileOutCleaned;
+                    }
+
+                } else {
+                    LOG.debug(
+                            "Json wasn't cleaned, "
+                            + "if json parsing fails consider reviewing 
dwgread json output to check it's valid");
+                }
+            } else if (fpr.isTimeout()) {
+                throw new TikaException(
+                        "DWGRead Failed - Timeout setting exceeded current 
setting of " + dwgc.getDwgReadTimeout() );
+            }
+            else {
+                throw new TikaException(
+                        "DWGRead Failed - Exit Code is:" + fpr.getExitValue() 
+ " Exe error is: " + fpr.getStderr() );
+            }
+
+            // we can't guarantee the json output is correct so we try to 
ignore as many
+            // errors as we can
+            JsonFactory jfactory = JsonFactory.builder()
+                    .enable(JsonReadFeature.ALLOW_MISSING_VALUES, 
+                            JsonReadFeature.ALLOW_UNESCAPED_CONTROL_CHARS,
+                            
JsonReadFeature.ALLOW_BACKSLASH_ESCAPING_ANY_CHARACTER, 
+                            JsonReadFeature.ALLOW_UNQUOTED_FIELD_NAMES, 
+                            JsonReadFeature.ALLOW_TRAILING_COMMA,
+                            JsonReadFeature.ALLOW_NON_NUMERIC_NUMBERS, 
+                            JsonReadFeature.ALLOW_LEADING_ZEROS_FOR_NUMBERS)
+                    .build();
+            JsonParser jParser;
+            try {
+                jParser = jfactory.createParser(tmpFileOut);
+            } catch (JsonParseException e1) {
+                throw new TikaException("Failed to parse Json: " + 
ExceptionUtils.getStackTrace(e1));
+            } catch (IOException e1) {
+                throw new TikaException("Failed to read json file: " + 
ExceptionUtils.getStackTrace(e1));
+            }
+            // read json token in a stream using jackson, iterate over each 
token. We only
+            // support OBJECTS, FILEHEADER and SummaryInfo
+            // these are the only ones we have in either sample files or have 
been tested
+            // with
+            DWGReadFormatRemover dwgReadFormatRemover = new 
DWGReadFormatRemover();
+            JsonToken nextToken = jParser.nextToken();
+            while ((nextToken = jParser.nextToken()) != JsonToken.END_OBJECT) {
+                if (nextToken == JsonToken.FIELD_NAME) {
+                    String nextFieldName = jParser.currentName();
+                    nextToken = jParser.nextToken();
+                    if (nextToken.isStructStart()) {
+
+                        if ("OBJECTS".equals(nextFieldName)) {
+                            // Start array
+                            jParser.nextToken();
+                            while (jParser.nextToken() != JsonToken.END_ARRAY) 
{
+                                parseDwgObject(jParser, (nextTextValue) -> {
+
+                                    try {
+                                        
xhtml.characters(dwgReadFormatRemover.cleanupDwgString(nextTextValue));
+                                        xhtml.newline();
+                                    } catch (SAXException e) {
+                                        LOG.error("Could not write next text 
value {} to xhtml stream", nextTextValue);
+                                    }
+                                });
+                            }
+                        } else if ("FILEHEADER".equals(nextFieldName)) {
+                            parseHeader(jParser, metadata);
+                        } else if ("SummaryInfo".equals(nextFieldName)) {
+                            parseSummaryInfo(jParser, metadata);
+                        } else {
+                            jParser.skipChildren();
+                        }
+                    }
+                }
+            }
+            jParser.close();
+        } finally {
+            // make sure we delete all temp files
+            FileUtils.deleteQuietly(tmpFileOut);
+            FileUtils.deleteQuietly(tmpFileIn);
+            FileUtils.deleteQuietly(tmpFileOutCleaned);
+        }
+
+        xhtml.endDocument();
+    }
+
+    private void parseDwgObject(JsonParser jsonParser, Consumer<String> 
textConsumer) throws IOException {
+        JsonToken nextToken;
+        while ((nextToken = jsonParser.nextToken()) != JsonToken.END_OBJECT) {
+            if (nextToken == JsonToken.FIELD_NAME) {
+                String nextFieldName = jsonParser.currentName();
+                nextToken = jsonParser.nextToken();
+                if (nextToken.isStructStart()) {
+                    jsonParser.skipChildren();
+                } else if (nextToken.isScalarValue()) {
+                    if ("text".equals(nextFieldName)) {
+                        String textVal = jsonParser.getText();
+                        if (StringUtils.isNotBlank(textVal)) {
+
+                            textConsumer.accept(textVal);
+                        }
+                    } else if ("text_value".equals(nextFieldName)) {
+                        String textVal = jsonParser.getText();
+                        if (StringUtils.isNotBlank(textVal)) {
+
+                            textConsumer.accept(textVal);
+
+                        }
+                    }
+                }
+            }
+        }
+    }
+
+    private void parseHeader(JsonParser jsonParser, Metadata metadata) throws 
IOException {
+        JsonToken nextToken;
+        while ((nextToken = jsonParser.nextToken()) != JsonToken.END_OBJECT) {
+            if (nextToken == JsonToken.FIELD_NAME) {
+                String nextFieldName = jsonParser.currentName();
+                nextToken = jsonParser.nextToken();
+                if (nextToken.isStructStart()) {
+                    jsonParser.skipChildren();
+                } else if (nextToken.isScalarValue()) {
+                    metadata.set(nextFieldName, jsonParser.getText());
+                }
+            }
+        }
+    }
+
+    private void parseSummaryInfo(JsonParser jsonParser, Metadata metadata) 
throws IOException {
+        JsonToken nextToken;
+        while ((nextToken = jsonParser.nextToken()) != JsonToken.END_OBJECT) {
+            if (nextToken == JsonToken.FIELD_NAME) {
+                String nextFieldName = jsonParser.currentName();
+                nextToken = jsonParser.nextToken();
+                if (nextToken.isStructStart()) {
+                    if ("TDCREATE".equals(nextFieldName) || 
"TDUPDATE".equals(nextFieldName)) {
+                        // timestamps are represented by an integer array of 
format with 2 values in the
+                        // array:
+                        // [julianDate, millisecondOfDay]
+                        jsonParser.nextToken(); // start array
+                        long julianDay = jsonParser.getValueAsLong();
+                        jsonParser.nextToken();
+                        long millisecondsIntoDay = jsonParser.getValueAsLong();
+                        Instant instant = JulianDateUtil.toInstant(julianDay, 
millisecondsIntoDay);
+                        jsonParser.nextToken(); // end array
+                        if ("TDCREATE".equals(nextFieldName)) {
+                            metadata.set(TikaCoreProperties.CREATED, 
instant.toString());
+                        } else {
+                            metadata.set(TikaCoreProperties.MODIFIED, 
instant.toString());
+                        }
+                    } else {
+                        jsonParser.skipChildren();
+                    }
+
+                } else if (nextToken.isScalarValue()) {
+                    String textVal = jsonParser.getText();
+                    if (StringUtils.isNotBlank(textVal)) {
+                        if (LOG.isDebugEnabled()) {
+                            LOG.debug("Summary Info - {} = {}", nextFieldName, 
textVal);
+                        }
+                        if ("TITLE".equals(nextFieldName)) {
+                            metadata.set(TikaCoreProperties.TITLE, textVal);
+                        } else if ("LASTSAVEDBY".equals(nextFieldName)) {
+                            metadata.set(TikaCoreProperties.MODIFIER, textVal);
+                        } else if 
(!StringUtils.startsWithIgnoreCase(nextFieldName, "unknown")) {
+                            metadata.set(nextFieldName, textVal);
+                        }
+                    }
+                }
+            }
+        }
+    }
+
+}
diff --git 
a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-cad-module/src/main/java/org/apache/tika/parser/dwg/JulianDateUtil.java
 
b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-cad-module/src/main/java/org/apache/tika/parser/dwg/JulianDateUtil.java
new file mode 100644
index 000000000..522df0883
--- /dev/null
+++ 
b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-cad-module/src/main/java/org/apache/tika/parser/dwg/JulianDateUtil.java
@@ -0,0 +1,47 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.tika.parser.dwg;
+
+import java.time.Instant;
+import java.time.ZoneOffset;
+import java.time.ZonedDateTime;
+import java.time.temporal.ChronoUnit;
+
+class JulianDateUtil {
+    private static final double NANOS_PER_DAY = 24.0 * 60.0 * 60.0 * 
1000000000.0;
+    public static final Instant REDUCED_JD = ZonedDateTime.of(1858, 11, 16, 
12, 0, 0, 0, ZoneOffset.UTC).toInstant();
+    public static final Instant JULIAN_DATE = REDUCED_JD.minus(2400000, 
ChronoUnit.DAYS);
+
+    private final Instant epoch;
+
+    private JulianDateUtil(Instant epoch) {
+        super();
+        this.epoch = epoch;
+    }
+
+    private Instant toInstant(double day) {
+        long l = (long) day;
+        return epoch.plus(l, ChronoUnit.DAYS).plusNanos(Math.round((day - l) * 
NANOS_PER_DAY));
+    }
+
+    public static Instant toInstant(long julianDay, long millisecondsIntoDay) {
+        return new JulianDateUtil(JulianDateUtil.JULIAN_DATE)
+                .toInstant(Double.parseDouble(julianDay + "." + 
millisecondsIntoDay));
+         
+    }
+}
diff --git 
a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-cad-module/src/test/java/org/apache/tika/parser/dwg/DWGParserTest.java
 
b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-cad-module/src/test/java/org/apache/tika/parser/dwg/DWGParserTest.java
index 88807b087..077e7700f 100644
--- 
a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-cad-module/src/test/java/org/apache/tika/parser/dwg/DWGParserTest.java
+++ 
b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-cad-module/src/test/java/org/apache/tika/parser/dwg/DWGParserTest.java
@@ -16,23 +16,48 @@
  */
 package org.apache.tika.parser.dwg;
 
-import static org.apache.tika.TikaTest.assertContains;
+
+
 import static org.junit.jupiter.api.Assertions.assertEquals;
 import static org.junit.jupiter.api.Assertions.assertNull;
+import static org.junit.jupiter.api.Assertions.assertThrows;
+import static org.junit.jupiter.api.Assertions.assertTrue;
+import static org.junit.jupiter.api.Assumptions.assumeTrue;
 
+import java.io.IOException;
 import java.io.InputStream;
+import java.nio.file.Files;
+import java.nio.file.Paths;
 import java.util.Arrays;
 
 import org.junit.jupiter.api.Test;
 import org.xml.sax.ContentHandler;
+import org.xml.sax.SAXException;
 
 import org.apache.tika.TikaTest;
+import org.apache.tika.config.TikaConfig;
+import org.apache.tika.exception.TikaException;
 import org.apache.tika.metadata.Metadata;
 import org.apache.tika.metadata.TikaCoreProperties;
+import org.apache.tika.parser.CompositeParser;
+import org.apache.tika.parser.ParseContext;
+import org.apache.tika.parser.external.ExternalParser;
 import org.apache.tika.sax.BodyContentHandler;
+import org.apache.tika.utils.StringUtils;
 
 public class DWGParserTest extends TikaTest {
+    public boolean canRun(DWGParser parser)  {
+        String dwgRead = parser.getDwgReadExecutable();
+
+        if (!StringUtils.isBlank(dwgRead) && 
!Files.isRegularFile(Paths.get(dwgRead))) {
+            return false;
+        }
 
+        // Try running DWGRead from there, and see if it exists + works
+        String[] checkCmd = { dwgRead };
+        return ExternalParser.check(checkCmd);
+
+    }
     @Test
     public void testDWG2000Parser() throws Exception {
         InputStream input =
@@ -80,7 +105,7 @@ public class DWGParserTest extends TikaTest {
                 
.getResourceAsStream("/test-documents/testDWG2010_custom_props.dwg")) {
             Metadata metadata = new Metadata();
             ContentHandler handler = new BodyContentHandler();
-            new DWGParser().parse(input, handler, metadata, null);
+            new DWGParser().parse(input, handler, metadata,new ParseContext());
 
             assertEquals("valueforcustomprop1",
                     metadata.get(DWGParser.DWG_CUSTOM_META_PREFIX + 
"customprop1"));
@@ -101,12 +126,12 @@ public class DWGParserTest extends TikaTest {
         }
     }
 
-    @SuppressWarnings("deprecation")
+
     private void testParser(InputStream input) throws Exception {
         try {
             Metadata metadata = new Metadata();
             ContentHandler handler = new BodyContentHandler();
-            new DWGParser().parse(input, handler, metadata);
+            new DWGParser().parse(input, handler, metadata,new ParseContext());
 
             assertEquals("image/vnd.dwg", metadata.get(Metadata.CONTENT_TYPE));
 
@@ -129,12 +154,12 @@ public class DWGParserTest extends TikaTest {
         }
     }
 
-    @SuppressWarnings("deprecation")
+
     private void testParserNoHeader(InputStream input) throws Exception {
         try {
             Metadata metadata = new Metadata();
             ContentHandler handler = new BodyContentHandler();
-            new DWGParser().parse(input, handler, metadata);
+            new DWGParser().parse(input, handler, metadata,new ParseContext());
 
             assertEquals("image/vnd.dwg", metadata.get(Metadata.CONTENT_TYPE));
 
@@ -152,12 +177,11 @@ public class DWGParserTest extends TikaTest {
         }
     }
 
-    @SuppressWarnings("deprecation")
     private void testParserAlt(InputStream input) throws Exception {
         try {
             Metadata metadata = new Metadata();
             ContentHandler handler = new BodyContentHandler();
-            new DWGParser().parse(input, handler, metadata);
+            new DWGParser().parse(input, handler, metadata, new 
ParseContext());
 
             assertEquals("image/vnd.dwg", metadata.get(Metadata.CONTENT_TYPE));
 
@@ -191,4 +215,31 @@ public class DWGParserTest extends TikaTest {
         assertEquals("jlakshvi", metadata.get(TikaCoreProperties.MODIFIER));
         assertEquals("CUSTOMER'S ADDRESS", metadata.get("dwg-custom:CUSTOMER'S 
ADDRESS"));
     }
+    @Test
+    public void testDWGReadexe() throws Exception {
+
+        InputStream stream = 
getResourceAsStream("/test-configs/tika-config-dwgRead.xml");
+        DWGParser parser =
+                (DWGParser) ((CompositeParser) new 
TikaConfig(stream).getParser())
+                        .getAllComponentParsers().get(0);
+        assumeTrue(canRun(parser), "Can't run DWGRead.exe");
+        String output = 
getText("architectural_-_annotation_scaling_and_multileaders.dwg", parser);
+        assertContains("ELEV. 11'-9\" TOP OF SECOND FLR.",output);
+    }
+
+    @Test
+    public void testDWGReadtimeout() throws TikaException, IOException, 
SAXException {
+
+        InputStream stream = 
getResourceAsStream("/test-configs/tika-config-dwgRead-Timeout.xml");
+        DWGParser parser = (DWGParser) ((CompositeParser) new 
TikaConfig(stream).getParser())
+                    .getAllComponentParsers().get(0);
+        assumeTrue(canRun(parser), "Can't run DWGRead.exe");
+        TikaException thrown = assertThrows(
+                TikaException.class,
+                () -> 
getText("architectural_-_annotation_scaling_and_multileaders.dwg", parser),
+                "Expected getText() to throw TikaException but it failed"
+        );
+        assertTrue(thrown.getMessage().contains("Timeout setting exceeded 
current setting of"));
+    }
+
 }
diff --git 
a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-cad-module/src/test/java/org/apache/tika/parser/dwg/DWGReadFormatRemoverTest.java
 
b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-cad-module/src/test/java/org/apache/tika/parser/dwg/DWGReadFormatRemoverTest.java
new file mode 100644
index 000000000..d570a6f6d
--- /dev/null
+++ 
b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-cad-module/src/test/java/org/apache/tika/parser/dwg/DWGReadFormatRemoverTest.java
@@ -0,0 +1,73 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.tika.parser.dwg;
+
+import static org.junit.jupiter.api.Assertions.assertEquals;
+
+import org.junit.jupiter.api.Test;
+
+
+public class DWGReadFormatRemoverTest {
+    @Test
+    public void testBasic()  {
+        String formatted = 
"\\A1;\\fAIGDT|b0|i0;\\H2.5000;\\ln\\fArial|b0|i0;\\H2.5000;68{\\H1.3;\\S+0,8^+0,1;}";
+        DWGReadFormatRemover dwgReadFormatter = new DWGReadFormatRemover();
+        String expected = "n68+0,8/+0,1";
+        assertEquals(expected, dwgReadFormatter.cleanupDwgString(formatted));
+    }
+
+    @Test
+    public void testParameterizables()  {
+        String formatted = "the quick \\A1;\\fAIGDT|b0|i0;\\H2.5000; brown 
fox";
+        DWGReadFormatRemover dwgReadFormatter = new DWGReadFormatRemover();
+        String expected = "the quick  brown fox";
+        assertEquals(expected, dwgReadFormatter.cleanupDwgString(formatted));
+    }
+    @Test
+    public void testEscapedSlashes()  {
+        String formatted = "the quick \\\\ \\A3;\\fAIGDT|b0|i0;\\H2.5000;brown 
fox";
+        DWGReadFormatRemover dwgReadFormatter = new DWGReadFormatRemover();
+        String expected = "the quick \\ brown fox";
+        assertEquals(expected, dwgReadFormatter.cleanupDwgString(formatted));
+    }
+
+    @Test
+    public void testUnderlineEtc()  {
+        String formatted = "l \\L open cu\\lrly bra\\Kck\\ket \\{ and a close 
" +
+                "\\} right?";
+        DWGReadFormatRemover dwgReadFormatter = new DWGReadFormatRemover();
+        String expected = "l  open curly bracket { and a close } right?";
+        assertEquals(expected, dwgReadFormatter.cleanupDwgString(formatted));
+
+    }
+    @Test
+    public void testEscaped()  {
+        String formatted = "then an actual \\P open curly bracket \\{ and a 
close \\} right?";
+        DWGReadFormatRemover dwgReadFormatter = new DWGReadFormatRemover();
+        String expected = "then an actual \n open curly bracket { and a close 
} right?";
+        assertEquals(expected, dwgReadFormatter.cleanupDwgString(formatted));
+    }
+
+    @Test
+    public void testStackedFractions()  {
+        String formatted = "abc \\S+0,8^+0,1; efg";
+        DWGReadFormatRemover dwgReadFormatter = new DWGReadFormatRemover();
+        String expected = "abc +0,8/+0,1 efg";
+        assertEquals(expected, dwgReadFormatter.cleanupDwgString(formatted));
+    }
+
+}
diff --git 
a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-cad-module/src/test/resources/test-configs/tika-config-dwgRead-Timeout.xml
 
b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-cad-module/src/test/resources/test-configs/tika-config-dwgRead-Timeout.xml
new file mode 100644
index 000000000..2ae7757bf
--- /dev/null
+++ 
b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-cad-module/src/test/resources/test-configs/tika-config-dwgRead-Timeout.xml
@@ -0,0 +1,27 @@
+<?xml version="1.0" encoding="UTF-8"?>
+<!--
+  Licensed to the Apache Software Foundation (ASF) under one or more
+  contributor license agreements.  See the NOTICE file distributed with
+  this work for additional information regarding copyright ownership.
+  The ASF licenses this file to You under the Apache License, Version 2.0
+  (the "License"); you may not use this file except in compliance with
+  the License.  You may obtain a copy of the License at
+
+  http://www.apache.org/licenses/LICENSE-2.0
+
+  Unless required by applicable law or agreed to in writing, software
+  distributed under the License is distributed on an "AS IS" BASIS,
+  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+  See the License for the specific language governing permissions and
+  limitations under the License.
+-->
+<properties>
+    <parsers>
+        <parser class="org.apache.tika.parser.dwg.DWGParser">
+            <params>
+                <param name="dwgReadExecutable" 
type="string">g:/libredwg-0.12.5-win64/dwgread.exe</param>
+                <param name="dwgReadTimeout" type="long">1</param>
+             </params>
+        </parser>
+    </parsers>
+</properties>
diff --git 
a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-cad-module/src/test/resources/test-configs/tika-config-dwgRead.xml
 
b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-cad-module/src/test/resources/test-configs/tika-config-dwgRead.xml
new file mode 100644
index 000000000..bddfa781c
--- /dev/null
+++ 
b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-cad-module/src/test/resources/test-configs/tika-config-dwgRead.xml
@@ -0,0 +1,26 @@
+<?xml version="1.0" encoding="UTF-8"?>
+<!--
+  Licensed to the Apache Software Foundation (ASF) under one or more
+  contributor license agreements.  See the NOTICE file distributed with
+  this work for additional information regarding copyright ownership.
+  The ASF licenses this file to You under the Apache License, Version 2.0
+  (the "License"); you may not use this file except in compliance with
+  the License.  You may obtain a copy of the License at
+
+  http://www.apache.org/licenses/LICENSE-2.0
+
+  Unless required by applicable law or agreed to in writing, software
+  distributed under the License is distributed on an "AS IS" BASIS,
+  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+  See the License for the specific language governing permissions and
+  limitations under the License.
+-->
+<properties>
+    <parsers>
+        <parser class="org.apache.tika.parser.dwg.DWGParser">
+            <params>
+                <param name="dwgReadExecutable" 
type="string">g:/libredwg-0.12.5-win64/dwgread.exe</param>
+             </params>
+        </parser>
+    </parsers>
+</properties>
diff --git 
a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-cad-module/src/test/resources/test-documents/architectural_-_annotation_scaling_and_multileaders.dwg
 
b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-cad-module/src/test/resources/test-documents/architectural_-_annotation_scaling_and_multileaders.dwg
new file mode 100644
index 000000000..0ad0df75a
Binary files /dev/null and 
b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-cad-module/src/test/resources/test-documents/architectural_-_annotation_scaling_and_multileaders.dwg
 differ

Reply via email to