[tika] 01/02: TIKA-2976: Add an XLZ Parser

dmeikle Tue, 29 Oct 2019 04:08:39 -0700

This is an automated email from the ASF dual-hosted git repository.

dmeikle pushed a commit to branch branch_1x
in repository https://gitbox.apache.org/repos/asf/tika.git


commit 003da648b52829fb0f19201bd6acda3687d83d31
Author: David Meikle <[email protected]>
AuthorDate: Tue Oct 29 11:00:01 2019 +0000

    TIKA-2976: Add an XLZ Parser
    
    (cherry picked from commit f4bd11b84bd82fd072e5c4e246ba391a2ffbb5ff)
---
 .../org/apache/tika/mime/tika-mimetypes.xml        |   6 +
 .../org/apache/tika/parser/pkg/PackageParser.java  |   1 +
 .../org/apache/tika/parser/xliff/XLZParser.java    | 146 +++++++++++++++++++++
 .../services/org.apache.tika.parser.Parser         |   3 +-
 .../apache/tika/parser/xliff/XLZParserTest.java    |  47 +++++++
 .../test/resources/test-documents/testXLIFF12.xlz  | Bin 0 -> 1004 bytes
 6 files changed, 202 insertions(+), 1 deletion(-)

diff --git a/tika-core/src/main/resources/org/apache/tika/mime/tika-mimetypes.xml b/tika-core/src/main/resources/org/apache/tika/mime/tika-mimetypes.xml
index dd7a206..d3dceb7 100644
--- a/tika-core/src/main/resources/org/apache/tika/mime/tika-mimetypes.xml
+++ b/tika-core/src/main/resources/org/apache/tika/mime/tika-mimetypes.xml
@@ -7226,6 +7226,12 @@
     <glob pattern="*.xliff"/>
   </mime-type>
 
+  <mime-type type="application/x-xliff+zip">
+    <sub-class-of type="application/zip"/>
+    <_comment>XLZ Archive</_comment>
+    <glob pattern="*.xlz"/>
+  </mime-type>
+
   <mime-type type="text/x-rsrc">
     <_comment>R source code</_comment>
     <glob pattern="*.r"/>
diff --git a/tika-parsers/src/main/java/org/apache/tika/parser/pkg/PackageParser.java b/tika-parsers/src/main/java/org/apache/tika/parser/pkg/PackageParser.java
index b06eacb..d8b2989 100644
--- a/tika-parsers/src/main/java/org/apache/tika/parser/pkg/PackageParser.java
+++ b/tika-parsers/src/main/java/org/apache/tika/parser/pkg/PackageParser.java
@@ -157,6 +157,7 @@ public class PackageParser extends AbstractParser {
                 "application/x-tika-ooxml",
                 "application/x-tika-ooxml-protected",
                 "application/x-tika-visio-ooxml",
+                "application/x-xliff+zip",
                 "application/x-xmind",
                 "model/vnd.dwfx+xps",
 
diff --git a/tika-parsers/src/main/java/org/apache/tika/parser/xliff/XLZParser.java b/tika-parsers/src/main/java/org/apache/tika/parser/xliff/XLZParser.java
new file mode 100644
index 0000000..b40be84
--- /dev/null
+++ b/tika-parsers/src/main/java/org/apache/tika/parser/xliff/XLZParser.java
@@ -0,0 +1,146 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.tika.parser.xliff;
+
+import org.apache.tika.exception.TikaException;
+import org.apache.tika.io.TikaInputStream;
+import org.apache.tika.metadata.Metadata;
+import org.apache.tika.mime.MediaType;
+import org.apache.tika.parser.AbstractParser;
+import org.apache.tika.parser.ParseContext;
+import org.apache.tika.parser.Parser;
+import org.apache.tika.sax.EndDocumentShieldingContentHandler;
+import org.apache.tika.sax.XHTMLContentHandler;
+import org.xml.sax.ContentHandler;
+import org.xml.sax.SAXException;
+
+import java.io.IOException;
+import java.io.InputStream;
+import java.util.Collections;
+import java.util.Enumeration;
+import java.util.Set;
+import java.util.zip.ZipEntry;
+import java.util.zip.ZipFile;
+import java.util.zip.ZipInputStream;
+
+/**
+ * Parser for XLZ archives: ZIP containers in which every entry whose name
+ * contains {@code .xlf} is handed to an {@link XLIFF12Parser}; others are skipped.
+ */
+public class XLZParser extends AbstractParser {
+
+    /** Serial version UID. */
+    private static final long serialVersionUID = -1877314028666058564L;
+
+    /** Custom XLZ mime type. */
+    private static final MediaType XLZ_CONTENT_TYPE = MediaType.application("x-xliff+zip");
+
+    /** Supported types set. */
+    private static final Set<MediaType> SUPPORTED_TYPES = Collections.singleton(XLZ_CONTENT_TYPE);
+
+    /** Name fragment that marks an archive entry as XLIFF content. */
+    private static final String XLF = ".xlf";
+
+    /** Parser that every XLIFF entry is delegated to. */
+    private final Parser xliffParser = new XLIFF12Parser();
+
+    @Override
+    public Set<MediaType> getSupportedTypes(ParseContext context) {
+        return SUPPORTED_TYPES;
+    }
+
+    /**
+     * Parses every XLIFF entry of the archive into one XHTML document.
+     * A {@link TikaInputStream} that carries an open {@link ZipFile}, or that
+     * is backed by a file, is read through {@link ZipFile}; any other stream
+     * is read sequentially through {@link ZipInputStream}.
+     *
+     * @param stream      the XLZ archive
+     * @param baseHandler receiver of the XHTML events
+     * @param metadata    metadata handed to the XHTML handler and the delegate parser
+     * @param context     parse context handed to the delegate parser
+     * @throws IOException if the archive cannot be read, or a streamed archive has no entries
+     */
+    @Override
+    public void parse(
+            InputStream stream, ContentHandler baseHandler,
+            Metadata metadata, ParseContext context)
+            throws IOException, SAXException, TikaException {
+        ZipFile zipFile = null;
+        if (stream instanceof TikaInputStream) {
+            TikaInputStream tis = (TikaInputStream) stream;
+            Object container = tis.getOpenContainer();
+            if (container instanceof ZipFile) {
+                zipFile = (ZipFile) container;
+            } else if (tis.hasFile()) {
+                zipFile = new ZipFile(tis.getFile());
+            }
+        }
+
+        // Delegates get a shielded handler so endDocument is forwarded only once, below.
+        XHTMLContentHandler xhtml = new XHTMLContentHandler(baseHandler, metadata);
+        EndDocumentShieldingContentHandler handler = new EndDocumentShieldingContentHandler(xhtml);
+        if (zipFile != null) {
+            try (ZipFile zip = zipFile) {
+                handleZipFile(zip, metadata, context, handler);
+            }
+        } else {
+            try (ZipInputStream zipStream = new ZipInputStream(stream)) {
+                handleZipStream(zipStream, metadata, context, handler);
+            }
+        }
+
+        if (handler.getEndDocumentWasCalled()) {
+            handler.reallyEndDocument();
+        }
+    }
+
+    /** Hands each XLIFF entry of a sequentially read archive to the delegate parser. */
+    private void handleZipStream(ZipInputStream zipStream, Metadata metadata, ParseContext context,
+                                 EndDocumentShieldingContentHandler handler)
+            throws IOException, TikaException, SAXException {
+        ZipEntry entry = zipStream.getNextEntry();
+        if (entry == null) {
+            throw new IOException("No entries found in ZipInputStream");
+        }
+        // A close() issued while one entry is parsed must not end the archive stream.
+        InputStream entryStream = new java.io.FilterInputStream(zipStream) {
+            @Override public void close() { /* the archive stream is closed by parse() */ }
+        };
+        for (; entry != null; entry = zipStream.getNextEntry()) {
+            if (entry.getName().contains(XLF)) {
+                xliffParser.parse(entryStream, handler, metadata, context);
+            }
+        }
+    }
+
+    /** Hands each XLIFF entry of a random-access archive to the delegate parser. */
+    private void handleZipFile(ZipFile zipFile, Metadata metadata,
+                               ParseContext context, EndDocumentShieldingContentHandler handler)
+            throws IOException, TikaException, SAXException {
+        Enumeration<? extends ZipEntry> entries = zipFile.entries();
+        while (entries.hasMoreElements()) {
+            ZipEntry entry = entries.nextElement();
+            if (entry.getName().contains(XLF)) {
+                // Close the entry stream right after use rather than at ZipFile.close().
+                try (InputStream entryStream = zipFile.getInputStream(entry)) {
+                    xliffParser.parse(entryStream, handler, metadata, context);
+                }
+            }
+        }
+    }
+}
diff --git a/tika-parsers/src/main/resources/META-INF/services/org.apache.tika.parser.Parser b/tika-parsers/src/main/resources/META-INF/services/org.apache.tika.parser.Parser
index 63e5031..83e78d4 100644
--- a/tika-parsers/src/main/resources/META-INF/services/org.apache.tika.parser.Parser
+++ b/tika-parsers/src/main/resources/META-INF/services/org.apache.tika.parser.Parser
@@ -82,4 +82,5 @@ org.apache.tika.parser.external.CompositeExternalParser
 org.apache.tika.parser.journal.JournalParser
 org.apache.tika.parser.image.ICNSParser
 org.apache.tika.parser.dbf.DBFParser
-org.apache.tika.parser.xliff.XLIFF12Parser
\ No newline at end of file
+org.apache.tika.parser.xliff.XLIFF12Parser
+org.apache.tika.parser.xliff.XLZParser
\ No newline at end of file
diff --git a/tika-parsers/src/test/java/org/apache/tika/parser/xliff/XLZParserTest.java b/tika-parsers/src/test/java/org/apache/tika/parser/xliff/XLZParserTest.java
new file mode 100644
index 0000000..12f52fc
--- /dev/null
+++ b/tika-parsers/src/test/java/org/apache/tika/parser/xliff/XLZParserTest.java
@@ -0,0 +1,47 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.tika.parser.xliff;
+
+import org.apache.tika.metadata.Metadata;
+import org.apache.tika.parser.ParseContext;
+import org.apache.tika.sax.BodyContentHandler;
+import org.junit.Test;
+import org.xml.sax.ContentHandler;
+
+import java.io.InputStream;
+
+import static org.apache.tika.TikaTest.assertContains;
+import static org.junit.Assert.assertEquals;
+
+public class XLZParserTest {
+
+    /** Parses the bundled XLZ sample and checks the extracted text and XLIFF metadata. */
+    @Test
+    public void testXLZ() throws Exception {
+        Metadata xliffMetadata = new Metadata();
+        ContentHandler body = new BodyContentHandler();
+        try (InputStream xlz = XLZParserTest.class.getResourceAsStream("/test-documents/testXLIFF12.xlz")) {
+            new XLZParser().parse(xlz, body, xliffMetadata, new ParseContext());
+        }
+        assertContains("Hooray", body.toString());
+        assertEquals("2", xliffMetadata.get("file-count"));
+        assertEquals("4", xliffMetadata.get("tu-count"));
+        assertEquals("en", xliffMetadata.get("source-language"));
+        assertEquals("fr", xliffMetadata.get("target-language"));
+    }
+
+}
diff --git a/tika-parsers/src/test/resources/test-documents/testXLIFF12.xlz b/tika-parsers/src/test/resources/test-documents/testXLIFF12.xlz
new file mode 100644
index 0000000..bfcce7f
Binary files /dev/null and b/tika-parsers/src/test/resources/test-documents/testXLIFF12.xlz differ

[tika] 01/02: TIKA-2976: Add an XLZ Parser

Reply via email to