[
https://issues.apache.org/jira/browse/TIKA-1735?page=com.atlassian.jira.plugin.system.issuetabpanels:comment-tabpanel&focusedCommentId=17536269#comment-17536269
]
ASF GitHub Bot commented on TIKA-1735:
--------------------------------------
monkmachine commented on code in PR #558:
URL: https://github.com/apache/tika/pull/558#discussion_r871666566
##########
tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-cad-module/src/main/java/org/apache/tika/parser/dwg/DWGReadParser.java:
##########
@@ -0,0 +1,208 @@
+package org.apache.tika.parser.dwg;
+
+import java.io.File;
+import java.io.FileInputStream;
+import java.io.FileOutputStream;
+import java.io.IOException;
+import java.io.InputStream;
+import java.nio.charset.StandardCharsets;
+import java.util.Arrays;
+import java.util.Collections;
+import java.util.List;
+import java.util.Set;
+import java.util.UUID;
+import java.util.function.Consumer;
+import org.apache.commons.lang3.StringUtils;
+import org.slf4j.Logger;
+import org.slf4j.LoggerFactory;
+import org.apache.commons.io.FileUtils;
+import org.apache.tika.exception.TikaException;
+import org.apache.tika.metadata.Metadata;
+import org.apache.tika.mime.MediaType;
+import org.apache.tika.parser.ParseContext;
+import org.apache.tika.sax.XHTMLContentHandler;
+
+import org.xml.sax.ContentHandler;
+import org.xml.sax.SAXException;
+
+import com.fasterxml.jackson.core.JsonFactory;
+import com.fasterxml.jackson.core.JsonParser;
+import com.fasterxml.jackson.core.JsonToken;
+import com.fasterxml.jackson.core.json.JsonReadFeature;
+
+
+public class DWGReadParser extends AbstractDWGParser {
+ private static final Logger LOG = LoggerFactory.getLogger(DWGParser.class);
+ /**
+ *
+ */
+ private static final long serialVersionUID = 7983127145030096837L;
+ private static MediaType TYPE = MediaType.image("vnd.dwg");
+
+ public Set < MediaType > getSupportedTypes(ParseContext context) {
+ return Collections.singleton(TYPE);
+ }
+
+ @Override
+ public void parse(InputStream stream, ContentHandler handler, Metadata
metadata, ParseContext context)
+ throws IOException, SAXException, TikaException {
+
+ configure(context);
+ DWGParserConfig dwgc = context.get(DWGParserConfig.class);
+ final XHTMLContentHandler xhtml = new XHTMLContentHandler(handler,
metadata);
+
+ xhtml.startDocument();
+ UUID uuid = UUID.randomUUID();
+ File tmpFileOut = File.createTempFile(uuid + "dwgreadout", ".json");
+ File tmpFileOutCleaned = File.createTempFile(uuid + "dwgreadoutclean",
".json");
+ File tmpFileIn = File.createTempFile(uuid + "dwgreadin", ".dwg");
+ try {
+
+ FileUtils.copyInputStreamToFile(stream, tmpFileIn);
+
+ List < String > command =
Arrays.asList(dwgc.getDwgReadExecutable(), "-O", "JSON", "-o",
+ tmpFileOut.getCanonicalPath(), tmpFileIn.getCanonicalPath());
+ Process p = new ProcessBuilder(command).start();
+
+ try {
+ int exitCode = p.waitFor();
+ } catch (InterruptedException e) {
+ Thread.currentThread().interrupt();
+ }
+
+ if (dwgc.isCleanDwgReadOutput()) {
+ // dwgread sometimes creates strings with invalid utf-8
sequences or invalid json (nan instead of NaN). replace them
+ // with empty string.
+
+ try (FileInputStream fis = new FileInputStream(tmpFileOut);
FileOutputStream fos = new FileOutputStream(tmpFileOutCleaned)) {
+ byte[] bytes = new
byte[dwgc.getCleanDwgReadOutputBatchSize()];
+ while (fis.read(bytes) != -1) {
+ byte[] fixedBytes = new String(bytes,
StandardCharsets.UTF_8)
+ .replaceAll(dwgc.getCleanDwgReadRegexToReplace(),
dwgc.getCleanDwgReadReplaceWith())
+ .getBytes(StandardCharsets.UTF_8);
+ fos.write(fixedBytes, 0, fixedBytes.length);
+ }
+ } finally {
+ FileUtils.deleteQuietly(tmpFileOut);
+ tmpFileOut = tmpFileOutCleaned;
+ }
+
+ }
+
+ // we can't guarantee the json output is correct so we try to
ignore as many errors as we can
+ JsonFactory jfactory =
JsonFactory.builder().enable(JsonReadFeature.ALLOW_MISSING_VALUES,JsonReadFeature.ALLOW_UNESCAPED_CONTROL_CHARS,JsonReadFeature.ALLOW_BACKSLASH_ESCAPING_ANY_CHARACTER).build();
+ JsonParser jParser = jfactory.createParser(tmpFileOut);
+ JsonToken nextToken = jParser.nextToken();
+ while ((nextToken = jParser.nextToken()) != JsonToken.END_OBJECT) {
+ if (nextToken == JsonToken.FIELD_NAME) {
+ String nextFieldName = jParser.currentName();
+ nextToken = jParser.nextToken();
+ if (nextToken.isStructStart()) {
+
+ if ("OBJECTS".equals(nextFieldName)) {
+ // Start array
+ jParser.nextToken();
+ while (jParser.nextToken() != JsonToken.END_ARRAY)
{
+ parseDwgObject(jParser, (nextTextValue) -> {
+
+ try {
+
xhtml.characters(cleanupDwgString(nextTextValue));
+
xhtml.newline();
+ } catch
(SAXException e) {
+
LOG.error("Could not write next text value {} to xhtml stream", nextTextValue);
+ }
+ });
+ }
+ } else if ("FILEHEADER".equals(nextFieldName)) {
+ parseHeader(jParser,metadata);
+ } else {
+ jParser.skipChildren();
+ }
+ }
+ }
+ }
+ jParser.close();
+ } finally {
+ FileUtils.deleteQuietly(tmpFileOut);
Review Comment:
Added tmpFileOutCleaned to the cleanup so that it is also deleted if we
encounter an exception.
> Unsupported AutoCAD drawing version: AC1027
> -------------------------------------------
>
> Key: TIKA-1735
> URL: https://issues.apache.org/jira/browse/TIKA-1735
> Project: Tika
> Issue Type: Bug
> Reporter: Luca Perico
> Priority: Major
> Attachments: testDWG-AC1027.dwg
>
>
> When trying to index a .dwg file (version AC1027), I get a 500 error response:
> "<?xml version=""1.0"" encoding=""UTF-8""?>
> <response>
> <lst name=""responseHeader""><int name=""status"">500</int><int
> name=""QTime"">3</int></lst><lst name=""error""><str name=""msg"">Unsupported
> AutoCAD drawing version: AC1027</str><str
> name=""trace"">org.apache.solr.common.SolrException:
> org.apache.tika.exception.TikaException: Unsupported AutoCAD drawing version:
> AC1027
> at
> org.apache.solr.handler.extraction.ExtractingDocumentLoader.load(ExtractingDocumentLoader.java:227)
> at
> org.apache.solr.handler.ContentStreamHandlerBase.handleRequestBody(ContentStreamHandlerBase.java:74)
> at
> org.apache.solr.handler.RequestHandlerBase.handleRequest(RequestHandlerBase.java:143)
> at org.apache.solr.core.SolrCore.execute(SolrCore.java:2064)
> at org.apache.solr.servlet.HttpSolrCall.execute(HttpSolrCall.java:654)
> at org.apache.solr.servlet.HttpSolrCall.call(HttpSolrCall.java:450)
> at
> org.apache.solr.servlet.SolrDispatchFilter.doFilter(SolrDispatchFilter.java:227)
> at
> org.apache.solr.servlet.SolrDispatchFilter.doFilter(SolrDispatchFilter.java:196)
> at
> org.eclipse.jetty.servlet.ServletHandler$CachedChain.doFilter(ServletHandler.java:1652)
> at
> org.eclipse.jetty.servlet.ServletHandler.doHandle(ServletHandler.java:585)
> at
> org.eclipse.jetty.server.handler.ScopedHandler.handle(ScopedHandler.java:143)
> at
> org.eclipse.jetty.security.SecurityHandler.handle(SecurityHandler.java:577)
> at
> org.eclipse.jetty.server.session.SessionHandler.doHandle(SessionHandler.java:223)
> at
> org.eclipse.jetty.server.handler.ContextHandler.doHandle(ContextHandler.java:1127)
> at
> org.eclipse.jetty.servlet.ServletHandler.doScope(ServletHandler.java:515)
> at
> org.eclipse.jetty.server.session.SessionHandler.doScope(SessionHandler.java:185)
> at
> org.eclipse.jetty.server.handler.ContextHandler.doScope(ContextHandler.java:1061)
> at
> org.eclipse.jetty.server.handler.ScopedHandler.handle(ScopedHandler.java:141)
> at
> org.eclipse.jetty.server.handler.ContextHandlerCollection.handle(ContextHandlerCollection.java:215)
> at
> org.eclipse.jetty.server.handler.HandlerCollection.handle(HandlerCollection.java:110)
> at
> org.eclipse.jetty.server.handler.HandlerWrapper.handle(HandlerWrapper.java:97)
> at org.eclipse.jetty.server.Server.handle(Server.java:497)
> at org.eclipse.jetty.server.HttpChannel.handle(HttpChannel.java:310)
> at
> org.eclipse.jetty.server.HttpConnection.onFillable(HttpConnection.java:257)
> at
> org.eclipse.jetty.io.AbstractConnection$2.run(AbstractConnection.java:540)
> at
> org.eclipse.jetty.util.thread.QueuedThreadPool.runJob(QueuedThreadPool.java:635)
> at
> org.eclipse.jetty.util.thread.QueuedThreadPool$3.run(QueuedThreadPool.java:555)
> at java.lang.Thread.run(Thread.java:745)
> Caused by: org.apache.tika.exception.TikaException: Unsupported AutoCAD
> drawing version: AC1027
> at org.apache.tika.parser.dwg.DWGParser.parse(DWGParser.java:131)
> at
> org.apache.tika.parser.CompositeParser.parse(CompositeParser.java:256)
> at
> org.apache.tika.parser.CompositeParser.parse(CompositeParser.java:256)
> at
> org.apache.tika.parser.AutoDetectParser.parse(AutoDetectParser.java:120)
> at
> org.apache.solr.handler.extraction.ExtractingDocumentLoader.load(ExtractingDocumentLoader.java:221)
> ... 27 more
> </str><int name=""code"">500</int></lst>
> </response>"
--
This message was sent by Atlassian Jira
(v8.20.7#820007)