[ 
https://issues.apache.org/jira/browse/TIKA-1735?page=com.atlassian.jira.plugin.system.issuetabpanels:comment-tabpanel&focusedCommentId=17536269#comment-17536269
 ] 

ASF GitHub Bot commented on TIKA-1735:
--------------------------------------

monkmachine commented on code in PR #558:
URL: https://github.com/apache/tika/pull/558#discussion_r871666566


##########
tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-cad-module/src/main/java/org/apache/tika/parser/dwg/DWGReadParser.java:
##########
@@ -0,0 +1,208 @@
+package org.apache.tika.parser.dwg;
+
+import java.io.File;
+import java.io.FileInputStream;
+import java.io.FileOutputStream;
+import java.io.IOException;
+import java.io.InputStream;
+import java.nio.charset.StandardCharsets;
+import java.util.Arrays;
+import java.util.Collections;
+import java.util.List;
+import java.util.Set;
+import java.util.UUID;
+import java.util.function.Consumer;
+import org.apache.commons.lang3.StringUtils;
+import org.slf4j.Logger;
+import org.slf4j.LoggerFactory;
+import org.apache.commons.io.FileUtils;
+import org.apache.tika.exception.TikaException;
+import org.apache.tika.metadata.Metadata;
+import org.apache.tika.mime.MediaType;
+import org.apache.tika.parser.ParseContext;
+import org.apache.tika.sax.XHTMLContentHandler;
+
+import org.xml.sax.ContentHandler;
+import org.xml.sax.SAXException;
+
+import com.fasterxml.jackson.core.JsonFactory;
+import com.fasterxml.jackson.core.JsonParser;
+import com.fasterxml.jackson.core.JsonToken;
+import com.fasterxml.jackson.core.json.JsonReadFeature;
+
+
+public class DWGReadParser extends AbstractDWGParser {
+    private static final Logger LOG = LoggerFactory.getLogger(DWGParser.class);
+    /**
+     * 
+     */
+    private static final long serialVersionUID = 7983127145030096837L;
+    private static MediaType TYPE = MediaType.image("vnd.dwg");
+
+    public Set < MediaType > getSupportedTypes(ParseContext context) {
+        return Collections.singleton(TYPE);
+    }
+
+    @Override
+    public void parse(InputStream stream, ContentHandler handler, Metadata 
metadata, ParseContext context)
+    throws IOException, SAXException, TikaException {
+
+        configure(context);
+        DWGParserConfig dwgc = context.get(DWGParserConfig.class);
+        final XHTMLContentHandler xhtml = new XHTMLContentHandler(handler, 
metadata);
+
+        xhtml.startDocument();
+        UUID uuid = UUID.randomUUID();
+        File tmpFileOut = File.createTempFile(uuid + "dwgreadout", ".json");
+        File tmpFileOutCleaned = File.createTempFile(uuid + "dwgreadoutclean", 
".json");
+        File tmpFileIn = File.createTempFile(uuid + "dwgreadin", ".dwg");
+        try {
+
+            FileUtils.copyInputStreamToFile(stream, tmpFileIn);
+
+            List < String > command = 
Arrays.asList(dwgc.getDwgReadExecutable(), "-O", "JSON", "-o",
+                tmpFileOut.getCanonicalPath(), tmpFileIn.getCanonicalPath());
+            Process p = new ProcessBuilder(command).start();
+
+            try {
+                int exitCode = p.waitFor();
+            } catch (InterruptedException e) {
+                Thread.currentThread().interrupt();
+            }
+
+            if (dwgc.isCleanDwgReadOutput()) {
+                // dwgread sometimes creates strings with invalid utf-8 
sequences or invalid json (nan instead of NaN). replace them
+                // with empty string.
+
+                try (FileInputStream fis = new FileInputStream(tmpFileOut); 
FileOutputStream fos = new FileOutputStream(tmpFileOutCleaned)) {
+                    byte[] bytes = new 
byte[dwgc.getCleanDwgReadOutputBatchSize()];
+                    while (fis.read(bytes) != -1) {
+                        byte[] fixedBytes = new String(bytes, 
StandardCharsets.UTF_8)
+                            .replaceAll(dwgc.getCleanDwgReadRegexToReplace(), 
dwgc.getCleanDwgReadReplaceWith())
+                            .getBytes(StandardCharsets.UTF_8);
+                        fos.write(fixedBytes, 0, fixedBytes.length);
+                    }
+                } finally {
+                    FileUtils.deleteQuietly(tmpFileOut);
+                    tmpFileOut = tmpFileOutCleaned;
+                }
+
+            }
+            
+            // we can't guarantee the json output is correct so we try to 
ignore as many errors as we can
+            JsonFactory jfactory = 
JsonFactory.builder().enable(JsonReadFeature.ALLOW_MISSING_VALUES,JsonReadFeature.ALLOW_UNESCAPED_CONTROL_CHARS,JsonReadFeature.ALLOW_BACKSLASH_ESCAPING_ANY_CHARACTER).build();
+            JsonParser jParser = jfactory.createParser(tmpFileOut);
+            JsonToken nextToken = jParser.nextToken();
+            while ((nextToken = jParser.nextToken()) != JsonToken.END_OBJECT) {
+                if (nextToken == JsonToken.FIELD_NAME) {
+                    String nextFieldName = jParser.currentName();
+                    nextToken = jParser.nextToken();
+                    if (nextToken.isStructStart()) {
+
+                        if ("OBJECTS".equals(nextFieldName)) {
+                            // Start array
+                            jParser.nextToken();
+                            while (jParser.nextToken() != JsonToken.END_ARRAY) 
{
+                                parseDwgObject(jParser, (nextTextValue) -> {
+
+                                    try {
+                                                                               
xhtml.characters(cleanupDwgString(nextTextValue));
+                                                                               
xhtml.newline();
+                                                                       } catch 
(SAXException e) {
+                                                                               
LOG.error("Could not write next text value {} to xhtml stream", nextTextValue);
+                                                                       }
+                                });
+                            }
+                        }  else if ("FILEHEADER".equals(nextFieldName)) {
+                            parseHeader(jParser,metadata);
+                        } else {
+                            jParser.skipChildren();
+                        }
+                    }
+                }
+            }
+            jParser.close();
+        } finally {
+            FileUtils.deleteQuietly(tmpFileOut);

Review Comment:
   Added tmpFileOutCleaned to the cleanup here so that it is also deleted if we 
encounter an exception





> Unsupported AutoCAD drawing version: AC1027
> -------------------------------------------
>
>                 Key: TIKA-1735
>                 URL: https://issues.apache.org/jira/browse/TIKA-1735
>             Project: Tika
>          Issue Type: Bug
>            Reporter: Luca Perico
>            Priority: Major
>         Attachments: testDWG-AC1027.dwg
>
>
> When trying to index a .dwg file (version AC1027), I get a 500 error response. 
> "<?xml version=""1.0"" encoding=""UTF-8""?>
> <response>
> <lst name=""responseHeader""><int name=""status"">500</int><int 
> name=""QTime"">3</int></lst><lst name=""error""><str A1:F378 Unsupported 
> AutoCAD drawing version: AC1027</str><str 
> name=""trace"">org.apache.solr.common.SolrException: 
> org.apache.tika.exception.TikaException: Unsupported AutoCAD drawing version: 
> AC1027
>       at 
> org.apache.solr.handler.extraction.ExtractingDocumentLoader.load(ExtractingDocumentLoader.java:227)
>       at 
> org.apache.solr.handler.ContentStreamHandlerBase.handleRequestBody(ContentStreamHandlerBase.java:74)
>       at 
> org.apache.solr.handler.RequestHandlerBase.handleRequest(RequestHandlerBase.java:143)
>       at org.apache.solr.core.SolrCore.execute(SolrCore.java:2064)
>       at org.apache.solr.servlet.HttpSolrCall.execute(HttpSolrCall.java:654)
>       at org.apache.solr.servlet.HttpSolrCall.call(HttpSolrCall.java:450)
>       at 
> org.apache.solr.servlet.SolrDispatchFilter.doFilter(SolrDispatchFilter.java:227)
>       at 
> org.apache.solr.servlet.SolrDispatchFilter.doFilter(SolrDispatchFilter.java:196)
>       at 
> org.eclipse.jetty.servlet.ServletHandler$CachedChain.doFilter(ServletHandler.java:1652)
>       at 
> org.eclipse.jetty.servlet.ServletHandler.doHandle(ServletHandler.java:585)
>       at 
> org.eclipse.jetty.server.handler.ScopedHandler.handle(ScopedHandler.java:143)
>       at 
> org.eclipse.jetty.security.SecurityHandler.handle(SecurityHandler.java:577)
>       at 
> org.eclipse.jetty.server.session.SessionHandler.doHandle(SessionHandler.java:223)
>       at 
> org.eclipse.jetty.server.handler.ContextHandler.doHandle(ContextHandler.java:1127)
>       at 
> org.eclipse.jetty.servlet.ServletHandler.doScope(ServletHandler.java:515)
>       at 
> org.eclipse.jetty.server.session.SessionHandler.doScope(SessionHandler.java:185)
>       at 
> org.eclipse.jetty.server.handler.ContextHandler.doScope(ContextHandler.java:1061)
>       at 
> org.eclipse.jetty.server.handler.ScopedHandler.handle(ScopedHandler.java:141)
>       at 
> org.eclipse.jetty.server.handler.ContextHandlerCollection.handle(ContextHandlerCollection.java:215)
>       at 
> org.eclipse.jetty.server.handler.HandlerCollection.handle(HandlerCollection.java:110)
>       at 
> org.eclipse.jetty.server.handler.HandlerWrapper.handle(HandlerWrapper.java:97)
>       at org.eclipse.jetty.server.Server.handle(Server.java:497)
>       at org.eclipse.jetty.server.HttpChannel.handle(HttpChannel.java:310)
>       at 
> org.eclipse.jetty.server.HttpConnection.onFillable(HttpConnection.java:257)
>       at 
> org.eclipse.jetty.io.AbstractConnection$2.run(AbstractConnection.java:540)
>       at 
> org.eclipse.jetty.util.thread.QueuedThreadPool.runJob(QueuedThreadPool.java:635)
>       at 
> org.eclipse.jetty.util.thread.QueuedThreadPool$3.run(QueuedThreadPool.java:555)
>       at java.lang.Thread.run(Thread.java:745)
> Caused by: org.apache.tika.exception.TikaException: Unsupported AutoCAD 
> drawing version: AC1027
>       at org.apache.tika.parser.dwg.DWGParser.parse(DWGParser.java:131)
>       at 
> org.apache.tika.parser.CompositeParser.parse(CompositeParser.java:256)
>       at 
> org.apache.tika.parser.CompositeParser.parse(CompositeParser.java:256)
>       at 
> org.apache.tika.parser.AutoDetectParser.parse(AutoDetectParser.java:120)
>       at 
> org.apache.solr.handler.extraction.ExtractingDocumentLoader.load(ExtractingDocumentLoader.java:221)
>       ... 27 more
> </str><int name=""code"">500</int></lst>
> </response>"



--
This message was sent by Atlassian Jira
(v8.20.7#820007)

Reply via email to