This is an automated email from the ASF dual-hosted git repository. dmeikle pushed a commit to branch branch_1x in repository https://gitbox.apache.org/repos/asf/tika.git
commit 003da648b52829fb0f19201bd6acda3687d83d31 Author: David Meikle <[email protected]> AuthorDate: Tue Oct 29 11:00:01 2019 +0000 TIKA-2976: Add an XLZ Parser (cherry picked from commit f4bd11b84bd82fd072e5c4e246ba391a2ffbb5ff) --- .../org/apache/tika/mime/tika-mimetypes.xml | 6 + .../org/apache/tika/parser/pkg/PackageParser.java | 1 + .../org/apache/tika/parser/xliff/XLZParser.java | 146 +++++++++++++++++++++ .../services/org.apache.tika.parser.Parser | 3 +- .../apache/tika/parser/xliff/XLZParserTest.java | 47 +++++++ .../test/resources/test-documents/testXLIFF12.xlz | Bin 0 -> 1004 bytes 6 files changed, 202 insertions(+), 1 deletion(-) diff --git a/tika-core/src/main/resources/org/apache/tika/mime/tika-mimetypes.xml b/tika-core/src/main/resources/org/apache/tika/mime/tika-mimetypes.xml index dd7a206..d3dceb7 100644 --- a/tika-core/src/main/resources/org/apache/tika/mime/tika-mimetypes.xml +++ b/tika-core/src/main/resources/org/apache/tika/mime/tika-mimetypes.xml @@ -7226,6 +7226,12 @@ <glob pattern="*.xliff"/> </mime-type> + <mime-type type="application/x-xliff+zip"> + <sub-class-of type="application/zip"/> + <_comment>XLZ Archive</_comment> + <glob pattern="*.xlz"/> + </mime-type> + <mime-type type="text/x-rsrc"> <_comment>R source code</_comment> <glob pattern="*.r"/> diff --git a/tika-parsers/src/main/java/org/apache/tika/parser/pkg/PackageParser.java b/tika-parsers/src/main/java/org/apache/tika/parser/pkg/PackageParser.java index b06eacb..d8b2989 100644 --- a/tika-parsers/src/main/java/org/apache/tika/parser/pkg/PackageParser.java +++ b/tika-parsers/src/main/java/org/apache/tika/parser/pkg/PackageParser.java @@ -157,6 +157,7 @@ public class PackageParser extends AbstractParser { "application/x-tika-ooxml", "application/x-tika-ooxml-protected", "application/x-tika-visio-ooxml", + "application/x-xliff+zip", "application/x-xmind", "model/vnd.dwfx+xps", diff --git a/tika-parsers/src/main/java/org/apache/tika/parser/xliff/XLZParser.java b/tika-parsers/src/main/java/org/apache/tika/parser/xliff/XLZParser.java new file mode 100644 index 0000000..b40be84 --- /dev/null +++ b/tika-parsers/src/main/java/org/apache/tika/parser/xliff/XLZParser.java @@ -0,0 +1,146 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.apache.tika.parser.xliff; + +import org.apache.tika.exception.TikaException; +import org.apache.tika.io.TikaInputStream; +import org.apache.tika.metadata.Metadata; +import org.apache.tika.mime.MediaType; +import org.apache.tika.parser.AbstractParser; +import org.apache.tika.parser.ParseContext; +import org.apache.tika.parser.Parser; +import org.apache.tika.sax.EndDocumentShieldingContentHandler; +import org.apache.tika.sax.XHTMLContentHandler; +import org.xml.sax.ContentHandler; +import org.xml.sax.SAXException; + +import java.io.IOException; +import java.io.InputStream; +import java.util.Collections; +import java.util.Enumeration; +import java.util.Set; +import java.util.zip.ZipEntry; +import java.util.zip.ZipFile; +import java.util.zip.ZipInputStream; + +/** + * Parser for XLZ Archives. + */ +public class XLZParser extends AbstractParser { + + /** + * Serial version UID + */ + private static final long serialVersionUID = -1877314028666058564L; + + /** + * Custom XLZ mime type. + */ + private static final MediaType XLZ_CONTENT_TYPE = MediaType.application("x-xliff+zip"); + + /** + * Supported types set. + */ + private static final Set<MediaType> SUPPORTED_TYPES = Collections.singleton(XLZ_CONTENT_TYPE); + + /** + * XLF Extension + */ + private static final String XLF = ".xlf"; + + @Override + public Set<MediaType> getSupportedTypes(ParseContext context) { + return SUPPORTED_TYPES; + } + + /** + * Shared Parser instance. + */ + private Parser xliffParser = new XLIFF12Parser(); + + public void parse( + InputStream stream, ContentHandler baseHandler, + Metadata metadata, ParseContext context) + throws IOException, SAXException, TikaException { + + ZipFile zipFile = null; + ZipInputStream zipStream = null; + if (stream instanceof TikaInputStream) { + TikaInputStream tis = (TikaInputStream) stream; + Object container = ((TikaInputStream) stream).getOpenContainer(); + if (container instanceof ZipFile) { + zipFile = (ZipFile) container; + } else if (tis.hasFile()) { + zipFile = new ZipFile(tis.getFile()); + } else { + zipStream = new ZipInputStream(stream); + } + } else { + zipStream = new ZipInputStream(stream); + } + + // Prepare to handle the content + XHTMLContentHandler xhtml = new XHTMLContentHandler(baseHandler, metadata); + EndDocumentShieldingContentHandler handler = new EndDocumentShieldingContentHandler(xhtml); + if (zipFile != null) { + try { + handleZipFile(zipFile, metadata, context, handler); + } finally { + zipFile.close(); + } + } else { + try { + handleZipStream(zipStream, metadata, context, handler); + } finally { + zipStream.close(); + } + } + + if (handler.getEndDocumentWasCalled()) { + handler.reallyEndDocument(); + } + } + private void handleZipStream(ZipInputStream zipStream, Metadata metadata, ParseContext context, + EndDocumentShieldingContentHandler handler) + throws IOException, TikaException, SAXException { + + ZipEntry entry = zipStream.getNextEntry(); + if (entry == null) { + throw new IOException("No entries found in ZipInputStream"); + } + while (entry != null) { + if (entry.getName().contains(XLF)) { + xliffParser.parse(zipStream, handler, metadata, context); + } + entry = zipStream.getNextEntry(); + } + } + + private void handleZipFile(ZipFile zipFile, Metadata metadata, + ParseContext context, EndDocumentShieldingContentHandler handler) + throws IOException, TikaException, SAXException { + + Enumeration<? extends ZipEntry> entries = zipFile.entries(); + while (entries.hasMoreElements()) { + ZipEntry entry = entries.nextElement(); + if (entry.getName().contains(XLF)) { + xliffParser.parse(zipFile.getInputStream(entry), handler, metadata, context); + } + } + } + +} diff --git a/tika-parsers/src/main/resources/META-INF/services/org.apache.tika.parser.Parser b/tika-parsers/src/main/resources/META-INF/services/org.apache.tika.parser.Parser index 63e5031..83e78d4 100644 --- a/tika-parsers/src/main/resources/META-INF/services/org.apache.tika.parser.Parser +++ b/tika-parsers/src/main/resources/META-INF/services/org.apache.tika.parser.Parser @@ -82,4 +82,5 @@ org.apache.tika.parser.external.CompositeExternalParser org.apache.tika.parser.journal.JournalParser org.apache.tika.parser.image.ICNSParser org.apache.tika.parser.dbf.DBFParser -org.apache.tika.parser.xliff.XLIFF12Parser \ No newline at end of file +org.apache.tika.parser.xliff.XLIFF12Parser +org.apache.tika.parser.xliff.XLZParser \ No newline at end of file diff --git a/tika-parsers/src/test/java/org/apache/tika/parser/xliff/XLZParserTest.java b/tika-parsers/src/test/java/org/apache/tika/parser/xliff/XLZParserTest.java new file mode 100644 index 0000000..12f52fc --- /dev/null +++ b/tika-parsers/src/test/java/org/apache/tika/parser/xliff/XLZParserTest.java @@ -0,0 +1,47 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.apache.tika.parser.xliff; + +import org.apache.tika.metadata.Metadata; +import org.apache.tika.parser.ParseContext; +import org.apache.tika.sax.BodyContentHandler; +import org.junit.Test; +import org.xml.sax.ContentHandler; + +import java.io.InputStream; + +import static org.apache.tika.TikaTest.assertContains; +import static org.junit.Assert.assertEquals; + +public class XLZParserTest { + + @Test + public void testXLZ() throws Exception { + try (InputStream input = XLZParserTest.class.getResourceAsStream("/test-documents/testXLIFF12.xlz")) { + Metadata metadata = new Metadata(); + ContentHandler handler = new BodyContentHandler(); + new XLZParser().parse(input, handler, metadata, new ParseContext()); + String content = handler.toString(); + assertContains("Hooray", content); + assertEquals("2", metadata.get("file-count")); + assertEquals("4", metadata.get("tu-count")); + assertEquals("en", metadata.get("source-language")); + assertEquals("fr", metadata.get("target-language")); + } + } + +} diff --git a/tika-parsers/src/test/resources/test-documents/testXLIFF12.xlz b/tika-parsers/src/test/resources/test-documents/testXLIFF12.xlz new file mode 100644 index 0000000..bfcce7f Binary files /dev/null and b/tika-parsers/src/test/resources/test-documents/testXLIFF12.xlz differ
