This is an automated email from the ASF dual-hosted git repository.
dmeikle pushed a commit to branch branch_1x
in repository https://gitbox.apache.org/repos/asf/tika.git
The following commit(s) were added to refs/heads/branch_1x by this push:
new 7ac9eab TIKA-2975: Add parser for XLIFF v1.2 files
new 1edd25d Merge remote-tracking branch 'origin/branch_1x' into branch_1x
7ac9eab is described below
commit 7ac9eabe97f97ef33ac9a610b1f0b614d4d0f9b8
Author: David Meikle <[email protected]>
AuthorDate: Sun Oct 27 01:29:33 2019 +0000
TIKA-2975: Add parser for XLIFF v1.2 files
---
.../org/apache/tika/mime/tika-mimetypes.xml | 8 ++
.../tika/parser/xliff/XLIFF12ContentHandler.java | 116 +++++++++++++++++++++
.../apache/tika/parser/xliff/XLIFF12Parser.java | 77 ++++++++++++++
.../services/org.apache.tika.parser.Parser | 1 +
.../tika/parser/xliff/XLIFF12ParserTest.java | 48 +++++++++
.../test/resources/test-documents/testXLIFF12.xlf | 35 +++++++
6 files changed, 285 insertions(+)
diff --git
a/tika-core/src/main/resources/org/apache/tika/mime/tika-mimetypes.xml
b/tika-core/src/main/resources/org/apache/tika/mime/tika-mimetypes.xml
index fc34cf8..b252081 100644
--- a/tika-core/src/main/resources/org/apache/tika/mime/tika-mimetypes.xml
+++ b/tika-core/src/main/resources/org/apache/tika/mime/tika-mimetypes.xml
@@ -7219,6 +7219,14 @@
<sub-class-of type="text/plain"/>
</mime-type>
+ <mime-type type="application/x-xliff+xml">
+ <sub-class-of type="application/xml"/>
+ <_comment>XLIFF 1.2 document</_comment>
+ <root-XML namespaceURI="urn:oasis:names:tc:xliff:document:1.2"
localName="xliff"/>
+ <glob pattern="*.xlf"/>
+ <glob pattern="*.xliff"/>
+ </mime-type>
+
<mime-type type="text/x-rsrc">
<_comment>R source code</_comment>
<glob pattern="*.r"/>
diff --git
a/tika-parsers/src/main/java/org/apache/tika/parser/xliff/XLIFF12ContentHandler.java
b/tika-parsers/src/main/java/org/apache/tika/parser/xliff/XLIFF12ContentHandler.java
new file mode 100644
index 0000000..954c217
--- /dev/null
+++
b/tika-parsers/src/main/java/org/apache/tika/parser/xliff/XLIFF12ContentHandler.java
@@ -0,0 +1,116 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.tika.parser.xliff;
+
+import org.apache.tika.metadata.Metadata;
+import org.apache.tika.sax.XHTMLContentHandler;
+import org.xml.sax.Attributes;
+import org.xml.sax.SAXException;
+import org.xml.sax.helpers.AttributesImpl;
+import org.xml.sax.helpers.DefaultHandler;
+
+/**
+ * SAX content handler that renders an XLIFF 1.2 document as XHTML and
+ * records summary metadata about it.
+ * <p>
+ * Each {@code file} element becomes a {@code div} headed by an {@code h1}
+ * containing the value of its {@code original} attribute. Each
+ * {@code trans-unit} becomes a nested {@code div}, and each {@code source}
+ * or {@code target} becomes a {@code p}; these three carry over the
+ * attributes of the XLIFF element. Character data is forwarded only while
+ * inside a {@code trans-unit}.
+ * <p>
+ * Metadata written: {@code original}, {@code source-language} and
+ * {@code target-language} (one value per {@code file} element that has the
+ * attribute), plus {@code file-count} and {@code tu-count} once the
+ * document ends.
+ */
+public class XLIFF12ContentHandler extends DefaultHandler {
+
+    private int numberOfFiles = 0;
+    private int numberOfTUs = 0;
+    private boolean inTransUnit = false;
+    private final XHTMLContentHandler xhtml;
+    private final Metadata metadata;
+
+    /**
+     * @param xhtml    handler that receives the generated XHTML events
+     * @param metadata metadata object that receives the extracted values
+     */
+    XLIFF12ContentHandler(XHTMLContentHandler xhtml, Metadata metadata) {
+        this.xhtml = xhtml;
+        this.metadata = metadata;
+    }
+
+    /**
+     * Opens the XHTML element that corresponds to the XLIFF element being
+     * started; elements other than {@code file}, {@code trans-unit},
+     * {@code source} and {@code target} produce no output.
+     */
+    @Override
+    public void startElement(
+            String uri, String localName, String qName, Attributes attributes)
+            throws SAXException {
+
+        if ("file".equals(localName)) {
+            startFile(attributes);
+        } else if ("trans-unit".equals(localName)) {
+            numberOfTUs++;
+            inTransUnit = true;
+            xhtml.startElement("div", copyOf(attributes));
+        } else if ("source".equals(localName) || "target".equals(localName)) {
+            xhtml.startElement("p", copyOf(attributes));
+        }
+    }
+
+    /**
+     * Closes the XHTML element opened by {@link #startElement} for the same
+     * XLIFF element.
+     */
+    @Override
+    public void endElement(String uri, String localName, String qName)
+            throws SAXException {
+
+        if ("file".equals(localName)) {
+            xhtml.endElement("div");
+        } else if ("trans-unit".equals(localName)) {
+            inTransUnit = false;
+            xhtml.endElement("div");
+        } else if ("source".equals(localName) || "target".equals(localName)) {
+            xhtml.endElement("p");
+        }
+    }
+
+    /**
+     * Forwards character data to the XHTML output, but only while inside a
+     * {@code trans-unit}; text elsewhere in the document is dropped.
+     */
+    @Override
+    public void characters(char[] ch, int start, int length)
+            throws SAXException {
+        if (inTransUnit && length != 0) {
+            xhtml.characters(ch, start, length);
+        }
+    }
+
+    /**
+     * Records the number of {@code file} and {@code trans-unit} elements
+     * seen over the whole document.
+     */
+    @Override
+    public void endDocument() {
+        metadata.set("file-count", String.valueOf(numberOfFiles));
+        metadata.set("tu-count", String.valueOf(numberOfTUs));
+    }
+
+    /**
+     * Handles the start of a {@code file} element: counts it, records its
+     * descriptive attributes as metadata and opens its XHTML section.
+     */
+    private void startFile(Attributes attributes) throws SAXException {
+        numberOfFiles++;
+
+        // Record the original file name. This previously read the
+        // "source-language" attribute by mistake.
+        final String original = attributes.getValue("original");
+        addIfPresent("original", original);
+
+        xhtml.startElement("div");
+        xhtml.startElement("h1");
+        xhtml.characters(original);
+        xhtml.endElement("h1");
+
+        // Record the file's source and target languages.
+        addIfPresent("source-language", attributes.getValue("source-language"));
+        addIfPresent("target-language", attributes.getValue("target-language"));
+    }
+
+    /**
+     * Adds a metadata value unless the attribute was absent from the
+     * element, so that no null value is recorded (for example
+     * {@code target-language} is optional in XLIFF 1.2).
+     */
+    private void addIfPresent(String name, String value) {
+        if (value != null) {
+            metadata.add(name, value);
+        }
+    }
+
+    /**
+     * Returns a private copy of the given attributes; the XHTML handler is
+     * handed the copy rather than the SAX parser's own object.
+     */
+    private static AttributesImpl copyOf(Attributes attributes) {
+        final AttributesImpl copy = new AttributesImpl();
+        copy.setAttributes(attributes);
+        return copy;
+    }
+
+}
diff --git
a/tika-parsers/src/main/java/org/apache/tika/parser/xliff/XLIFF12Parser.java
b/tika-parsers/src/main/java/org/apache/tika/parser/xliff/XLIFF12Parser.java
new file mode 100644
index 0000000..40218b0
--- /dev/null
+++ b/tika-parsers/src/main/java/org/apache/tika/parser/xliff/XLIFF12Parser.java
@@ -0,0 +1,77 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.tika.parser.xliff;
+
+import org.apache.commons.io.input.CloseShieldInputStream;
+import org.apache.tika.exception.TikaException;
+import org.apache.tika.metadata.Metadata;
+import org.apache.tika.mime.MediaType;
+import org.apache.tika.parser.AbstractParser;
+import org.apache.tika.parser.ParseContext;
+import org.apache.tika.sax.OfflineContentHandler;
+import org.apache.tika.sax.XHTMLContentHandler;
+import org.apache.tika.utils.XMLReaderUtils;
+import org.xml.sax.ContentHandler;
+import org.xml.sax.SAXException;
+
+import java.io.IOException;
+import java.io.InputStream;
+import java.util.Collections;
+import java.util.Set;
+
+/**
+ * Parser for XLIFF 1.2 files.
+ * <p>
+ * Reads the document with SAX and delegates to
+ * {@link XLIFF12ContentHandler}, which produces the XHTML output and the
+ * XLIFF-specific metadata.
+ */
+public class XLIFF12Parser extends AbstractParser {
+
+    /**
+     * Serial Version UID.
+     */
+    private static final long serialVersionUID = 1490085649251663857L;
+
+    /**
+     * Pre-Xliff 2.0 mime type.
+     */
+    private static final MediaType XLF_CONTENT_TYPE =
+            MediaType.application("x-xliff+xml");
+
+    /**
+     * Supported types set.
+     */
+    private static final Set<MediaType> SUPPORTED_TYPES =
+            Collections.singleton(XLF_CONTENT_TYPE);
+
+    /**
+     * @return the single media type handled by this parser,
+     *         {@code application/x-xliff+xml}
+     */
+    @Override
+    public Set<MediaType> getSupportedTypes(ParseContext context) {
+        return SUPPORTED_TYPES;
+    }
+
+    /**
+     * Parses an XLIFF 1.2 stream, sending XHTML events to {@code handler}
+     * and setting the content type on {@code metadata}.
+     *
+     * @param stream   the XLIFF document; it is wrapped in a
+     *                 {@link CloseShieldInputStream} so that it is left open
+     *                 for the caller
+     * @param handler  receiver of the generated XHTML SAX events
+     * @param metadata receives the content type and the values extracted by
+     *                 {@link XLIFF12ContentHandler}
+     * @param context  parse context passed on to the SAX parsing utility
+     * @throws IOException   if the stream cannot be read
+     * @throws SAXException  if the XML cannot be processed
+     * @throws TikaException if the document cannot be parsed
+     */
+    @Override
+    public void parse(
+            InputStream stream, ContentHandler handler,
+            Metadata metadata, ParseContext context)
+            throws IOException, SAXException, TikaException {
+
+        metadata.set(Metadata.CONTENT_TYPE, XLF_CONTENT_TYPE.toString());
+
+        final XHTMLContentHandler xhtml =
+                new XHTMLContentHandler(handler, metadata);
+
+        // XLIFF12ContentHandler does not forward the SAX document start/end
+        // events, so bracket the parse explicitly; otherwise the downstream
+        // handler never sees the document being opened or closed.
+        xhtml.startDocument();
+        XMLReaderUtils.parseSAX(
+                new CloseShieldInputStream(stream),
+                new OfflineContentHandler(
+                        new XLIFF12ContentHandler(xhtml, metadata)),
+                context);
+        xhtml.endDocument();
+    }
+
+}
diff --git
a/tika-parsers/src/main/resources/META-INF/services/org.apache.tika.parser.Parser
b/tika-parsers/src/main/resources/META-INF/services/org.apache.tika.parser.Parser
index a33a578..d6edd56 100644
---
a/tika-parsers/src/main/resources/META-INF/services/org.apache.tika.parser.Parser
+++
b/tika-parsers/src/main/resources/META-INF/services/org.apache.tika.parser.Parser
@@ -81,3 +81,4 @@ org.apache.tika.parser.external.CompositeExternalParser
org.apache.tika.parser.journal.JournalParser
org.apache.tika.parser.image.ICNSParser
org.apache.tika.parser.dbf.DBFParser
+org.apache.tika.parser.xliff.XLIFF12Parser
\ No newline at end of file
diff --git
a/tika-parsers/src/test/java/org/apache/tika/parser/xliff/XLIFF12ParserTest.java
b/tika-parsers/src/test/java/org/apache/tika/parser/xliff/XLIFF12ParserTest.java
new file mode 100644
index 0000000..9f69ea5
--- /dev/null
+++
b/tika-parsers/src/test/java/org/apache/tika/parser/xliff/XLIFF12ParserTest.java
@@ -0,0 +1,48 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.tika.parser.xliff;
+
+import static org.apache.tika.TikaTest.assertContains;
+import static org.junit.Assert.assertEquals;
+
+import java.io.InputStream;
+
+import org.apache.tika.metadata.Metadata;
+import org.apache.tika.parser.ParseContext;
+import org.apache.tika.sax.BodyContentHandler;
+import org.junit.Test;
+import org.xml.sax.ContentHandler;
+
+/**
+ * Exercises {@link XLIFF12Parser} against the bundled two-file XLIFF 1.2
+ * sample document.
+ */
+public class XLIFF12ParserTest {
+
+    /**
+     * Parses the sample and checks the extracted text together with the
+     * file count, trans-unit count and language metadata.
+     */
+    @Test
+    public void testXLIFF12() throws Exception {
+        final Metadata extracted = new Metadata();
+        final ContentHandler body = new BodyContentHandler();
+
+        try (InputStream sample = XLIFF12ParserTest.class
+                .getResourceAsStream("/test-documents/testXLIFF12.xlf")) {
+            final XLIFF12Parser parser = new XLIFF12Parser();
+            parser.parse(sample, body, extracted, new ParseContext());
+        }
+
+        // Text of the first source segment must reach the body output.
+        final String text = body.toString();
+        assertContains("Hooray", text);
+
+        // The sample holds two <file> elements and four <trans-unit>s.
+        assertEquals("2", extracted.get("file-count"));
+        assertEquals("4", extracted.get("tu-count"));
+
+        assertEquals("en", extracted.get("source-language"));
+        assertEquals("fr", extracted.get("target-language"));
+    }
+
+}
diff --git a/tika-parsers/src/test/resources/test-documents/testXLIFF12.xlf
b/tika-parsers/src/test/resources/test-documents/testXLIFF12.xlf
new file mode 100644
index 0000000..13b01ce
--- /dev/null
+++ b/tika-parsers/src/test/resources/test-documents/testXLIFF12.xlf
@@ -0,0 +1,35 @@
+<?xml version="1.0" encoding="UTF-8"?>
+<xliff version="1.2"
+ xmlns="urn:oasis:names:tc:xliff:document:1.2"
+ xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
+ xsi:schemaLocation="urn:oasis:names:tc:xliff:document:1.2
xliff-core-1.2-strict.xsd">
+ <file original="sample.properties"
+ source-language="en" target-language="fr"
+ datatype="javapropertyresourcebundle">
+ <body>
+ <trans-unit id="1" resname="welcome">
+ <source xml:lang="en">Hooray, you're here! The day just got
better - enjoy the following tips!</source>
+ <target xml:lang="fr">Hurrah, tu es là. Ça fait une bonne
journée - profite des renseignements suivants !</target>
+ </trans-unit>
+ <trans-unit id="2" resname="text_segment">
+ <source xml:lang="en">A section of text like this is known as
a text segment. Start rockin' your translations now!</source>
+ <target xml:lang="fr">Un tel extrait est qualifié de segment
du texte chez nous. C’est parti – commence tes traductions tout de
suite/maintenant !</target>
+ </trans-unit>
+ <trans-unit id="3" resname="tab_shortcut">
+ <source xml:lang="en">Arriba, Arriba! Andale, Andale! Be fast
as Speedy Gonzales. Just hit TAB to save and go to the next text segment, once
you're done.</source>
+ <target xml:lang="fr">Arriba, Arriba ! Andale, Andale ! Prends
Speedy Gonzales de vitesse. Appuie sur TAB pour sauvegarder et continuer avec
le segment prochain dès que tu as fini.</target>
+ </trans-unit>
+ </body>
+ </file>
+
+ <file original="sample2.properties"
+ source-language="en" target-language="fr"
+ datatype="javapropertyresourcebundle">
+ <body>
+ <trans-unit id="4" resname="welcome">
+ <source xml:lang="en">Another trans-unit</source>
+ <target xml:lang="fr">Un autre trans-unit</target>
+ </trans-unit>
+ </body>
+ </file>
+</xliff>
\ No newline at end of file