This is an automated email from the ASF dual-hosted git repository.
dmeikle pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/tika.git
The following commit(s) were added to refs/heads/master by this push:
new f4bd11b TIKA-2976: Add an XLZ Parser
new 4fc63fa Merge pull request #296 from dameikle/master
f4bd11b is described below
commit f4bd11b84bd82fd072e5c4e246ba391a2ffbb5ff
Author: David Meikle <[email protected]>
AuthorDate: Tue Oct 29 11:00:01 2019 +0000
TIKA-2976: Add an XLZ Parser
---
.../org/apache/tika/mime/tika-mimetypes.xml | 6 +
.../org/apache/tika/parser/pkg/PackageParser.java | 1 +
.../org/apache/tika/parser/xliff/XLZParser.java | 146 +++++++++++++++++++++
.../services/org.apache.tika.parser.Parser | 3 +-
.../apache/tika/parser/xliff/XLZParserTest.java | 47 +++++++
.../test/resources/test-documents/testXLIFF12.xlz | Bin 0 -> 1004 bytes
6 files changed, 202 insertions(+), 1 deletion(-)
diff --git a/tika-core/src/main/resources/org/apache/tika/mime/tika-mimetypes.xml b/tika-core/src/main/resources/org/apache/tika/mime/tika-mimetypes.xml
index 0f71e41..c5ad55d 100644
--- a/tika-core/src/main/resources/org/apache/tika/mime/tika-mimetypes.xml
+++ b/tika-core/src/main/resources/org/apache/tika/mime/tika-mimetypes.xml
@@ -7447,6 +7447,12 @@
<glob pattern="*.xliff"/>
</mime-type>
+ <mime-type type="application/x-xliff+zip">
+ <sub-class-of type="application/zip"/>
+ <_comment>XLZ Archive</_comment>
+ <glob pattern="*.xlz"/>
+ </mime-type>
+
<mime-type type="text/x-rsrc">
<_comment>R source code</_comment>
<glob pattern="*.r"/>
diff --git a/tika-parsers/src/main/java/org/apache/tika/parser/pkg/PackageParser.java b/tika-parsers/src/main/java/org/apache/tika/parser/pkg/PackageParser.java
index b1e0f93..9da682c 100644
--- a/tika-parsers/src/main/java/org/apache/tika/parser/pkg/PackageParser.java
+++ b/tika-parsers/src/main/java/org/apache/tika/parser/pkg/PackageParser.java
@@ -157,6 +157,7 @@ public class PackageParser extends AbstractParser {
"application/x-tika-ooxml",
"application/x-tika-ooxml-protected",
"application/x-tika-visio-ooxml",
+ "application/x-xliff+zip",
"application/x-xmind",
"model/vnd.dwfx+xps",
diff --git a/tika-parsers/src/main/java/org/apache/tika/parser/xliff/XLZParser.java b/tika-parsers/src/main/java/org/apache/tika/parser/xliff/XLZParser.java
new file mode 100644
index 0000000..b40be84
--- /dev/null
+++ b/tika-parsers/src/main/java/org/apache/tika/parser/xliff/XLZParser.java
@@ -0,0 +1,146 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.tika.parser.xliff;
+
+import org.apache.tika.exception.TikaException;
+import org.apache.tika.io.TikaInputStream;
+import org.apache.tika.metadata.Metadata;
+import org.apache.tika.mime.MediaType;
+import org.apache.tika.parser.AbstractParser;
+import org.apache.tika.parser.ParseContext;
+import org.apache.tika.parser.Parser;
+import org.apache.tika.sax.EndDocumentShieldingContentHandler;
+import org.apache.tika.sax.XHTMLContentHandler;
+import org.xml.sax.ContentHandler;
+import org.xml.sax.SAXException;
+
+import java.io.IOException;
+import java.io.InputStream;
+import java.util.Collections;
+import java.util.Enumeration;
+import java.util.Set;
+import java.util.zip.ZipEntry;
+import java.util.zip.ZipFile;
+import java.util.zip.ZipInputStream;
+
+/**
+ * Parser for XLZ Archives.
+ */
+public class XLZParser extends AbstractParser {
+
+ /**
+ * Serial version UID
+ */
+ private static final long serialVersionUID = -1877314028666058564L;
+
+ /**
+ * Custom XLZ mime type.
+ */
+ private static final MediaType XLZ_CONTENT_TYPE = MediaType.application("x-xliff+zip");
+
+ /**
+ * Supported types set.
+ */
+ private static final Set<MediaType> SUPPORTED_TYPES = Collections.singleton(XLZ_CONTENT_TYPE);
+
+ /**
+ * XLF Extension
+ */
+ private static final String XLF = ".xlf";
+
+ @Override
+ public Set<MediaType> getSupportedTypes(ParseContext context) {
+ return SUPPORTED_TYPES;
+ }
+
+ /**
+ * Shared Parser instance.
+ */
+ private Parser xliffParser = new XLIFF12Parser();
+
+ public void parse(
+ InputStream stream, ContentHandler baseHandler,
+ Metadata metadata, ParseContext context)
+ throws IOException, SAXException, TikaException {
+
+ ZipFile zipFile = null;
+ ZipInputStream zipStream = null;
+ if (stream instanceof TikaInputStream) {
+ TikaInputStream tis = (TikaInputStream) stream;
+ Object container = ((TikaInputStream) stream).getOpenContainer();
+ if (container instanceof ZipFile) {
+ zipFile = (ZipFile) container;
+ } else if (tis.hasFile()) {
+ zipFile = new ZipFile(tis.getFile());
+ } else {
+ zipStream = new ZipInputStream(stream);
+ }
+ } else {
+ zipStream = new ZipInputStream(stream);
+ }
+
+ // Prepare to handle the content
+ XHTMLContentHandler xhtml = new XHTMLContentHandler(baseHandler, metadata);
+ EndDocumentShieldingContentHandler handler = new EndDocumentShieldingContentHandler(xhtml);
+ if (zipFile != null) {
+ try {
+ handleZipFile(zipFile, metadata, context, handler);
+ } finally {
+ zipFile.close();
+ }
+ } else {
+ try {
+ handleZipStream(zipStream, metadata, context, handler);
+ } finally {
+ zipStream.close();
+ }
+ }
+
+ if (handler.getEndDocumentWasCalled()) {
+ handler.reallyEndDocument();
+ }
+ }
+ private void handleZipStream(ZipInputStream zipStream, Metadata metadata, ParseContext context,
+ EndDocumentShieldingContentHandler handler)
+ throws IOException, TikaException, SAXException {
+
+ ZipEntry entry = zipStream.getNextEntry();
+ if (entry == null) {
+ throw new IOException("No entries found in ZipInputStream");
+ }
+ while (entry != null) {
+ if (entry.getName().contains(XLF)) {
+ xliffParser.parse(zipStream, handler, metadata, context);
+ }
+ entry = zipStream.getNextEntry();
+ }
+ }
+
+ private void handleZipFile(ZipFile zipFile, Metadata metadata,
+ ParseContext context, EndDocumentShieldingContentHandler handler)
+ throws IOException, TikaException, SAXException {
+
+ Enumeration<? extends ZipEntry> entries = zipFile.entries();
+ while (entries.hasMoreElements()) {
+ ZipEntry entry = entries.nextElement();
+ if (entry.getName().contains(XLF)) {
+ xliffParser.parse(zipFile.getInputStream(entry), handler, metadata, context);
+ }
+ }
+ }
+
+}
diff --git a/tika-parsers/src/main/resources/META-INF/services/org.apache.tika.parser.Parser b/tika-parsers/src/main/resources/META-INF/services/org.apache.tika.parser.Parser
index f4ba809..120c28b 100644
--- a/tika-parsers/src/main/resources/META-INF/services/org.apache.tika.parser.Parser
+++ b/tika-parsers/src/main/resources/META-INF/services/org.apache.tika.parser.Parser
@@ -83,4 +83,5 @@ org.apache.tika.parser.external.CompositeExternalParser
org.apache.tika.parser.journal.JournalParser
org.apache.tika.parser.image.ICNSParser
org.apache.tika.parser.dbf.DBFParser
-org.apache.tika.parser.xliff.XLIFF12Parser
\ No newline at end of file
+org.apache.tika.parser.xliff.XLIFF12Parser
+org.apache.tika.parser.xliff.XLZParser
\ No newline at end of file
diff --git a/tika-parsers/src/test/java/org/apache/tika/parser/xliff/XLZParserTest.java b/tika-parsers/src/test/java/org/apache/tika/parser/xliff/XLZParserTest.java
new file mode 100644
index 0000000..12f52fc
--- /dev/null
+++ b/tika-parsers/src/test/java/org/apache/tika/parser/xliff/XLZParserTest.java
@@ -0,0 +1,47 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.tika.parser.xliff;
+
+import org.apache.tika.metadata.Metadata;
+import org.apache.tika.parser.ParseContext;
+import org.apache.tika.sax.BodyContentHandler;
+import org.junit.Test;
+import org.xml.sax.ContentHandler;
+
+import java.io.InputStream;
+
+import static org.apache.tika.TikaTest.assertContains;
+import static org.junit.Assert.assertEquals;
+
+public class XLZParserTest {
+
+ @Test
+ public void testXLZ() throws Exception {
+ try (InputStream input = XLZParserTest.class.getResourceAsStream("/test-documents/testXLIFF12.xlz")) {
+ Metadata metadata = new Metadata();
+ ContentHandler handler = new BodyContentHandler();
+ new XLZParser().parse(input, handler, metadata, new ParseContext());
+ String content = handler.toString();
+ assertContains("Hooray", content);
+ assertEquals("2", metadata.get("file-count"));
+ assertEquals("4", metadata.get("tu-count"));
+ assertEquals("en", metadata.get("source-language"));
+ assertEquals("fr", metadata.get("target-language"));
+ }
+ }
+
+}
diff --git a/tika-parsers/src/test/resources/test-documents/testXLIFF12.xlz b/tika-parsers/src/test/resources/test-documents/testXLIFF12.xlz
new file mode 100644
index 0000000..bfcce7f
Binary files /dev/null and b/tika-parsers/src/test/resources/test-documents/testXLIFF12.xlz differ