This is an automated email from the ASF dual-hosted git repository.

dmeikle pushed a commit to branch branch_1x
in repository https://gitbox.apache.org/repos/asf/tika.git


The following commit(s) were added to refs/heads/branch_1x by this push:
     new 7ac9eab  TIKA-2975: Add parser for XLIFF v1.2 files
     new 1edd25d  Merge remote-tracking branch 'origin/branch_1x' into branch_1x
7ac9eab is described below

commit 7ac9eabe97f97ef33ac9a610b1f0b614d4d0f9b8
Author: David Meikle <[email protected]>
AuthorDate: Sun Oct 27 01:29:33 2019 +0000

    TIKA-2975: Add parser for XLIFF v1.2 files
---
 .../org/apache/tika/mime/tika-mimetypes.xml        |   8 ++
 .../tika/parser/xliff/XLIFF12ContentHandler.java   | 116 +++++++++++++++++++++
 .../apache/tika/parser/xliff/XLIFF12Parser.java    |  77 ++++++++++++++
 .../services/org.apache.tika.parser.Parser         |   1 +
 .../tika/parser/xliff/XLIFF12ParserTest.java       |  48 +++++++++
 .../test/resources/test-documents/testXLIFF12.xlf  |  35 +++++++
 6 files changed, 285 insertions(+)

diff --git 
a/tika-core/src/main/resources/org/apache/tika/mime/tika-mimetypes.xml 
b/tika-core/src/main/resources/org/apache/tika/mime/tika-mimetypes.xml
index fc34cf8..b252081 100644
--- a/tika-core/src/main/resources/org/apache/tika/mime/tika-mimetypes.xml
+++ b/tika-core/src/main/resources/org/apache/tika/mime/tika-mimetypes.xml
@@ -7219,6 +7219,14 @@
     <sub-class-of type="text/plain"/>
   </mime-type>
 
+  <mime-type type="application/x-xliff+xml">
+    <sub-class-of type="application/xml"/>
+    <_comment>XLIFF 1.2 document</_comment>
+    <root-XML namespaceURI="urn:oasis:names:tc:xliff:document:1.2" localName="xliff"/>
+    <glob pattern="*.xlf"/>
+    <glob pattern="*.xliff"/>
+  </mime-type>
+
   <mime-type type="text/x-rsrc">
     <_comment>R source code</_comment>
     <glob pattern="*.r"/>
diff --git 
a/tika-parsers/src/main/java/org/apache/tika/parser/xliff/XLIFF12ContentHandler.java
 
b/tika-parsers/src/main/java/org/apache/tika/parser/xliff/XLIFF12ContentHandler.java
new file mode 100644
index 0000000..954c217
--- /dev/null
+++ 
b/tika-parsers/src/main/java/org/apache/tika/parser/xliff/XLIFF12ContentHandler.java
@@ -0,0 +1,116 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.tika.parser.xliff;
+
+import org.apache.tika.metadata.Metadata;
+import org.apache.tika.sax.XHTMLContentHandler;
+import org.xml.sax.Attributes;
+import org.xml.sax.SAXException;
+import org.xml.sax.helpers.AttributesImpl;
+import org.xml.sax.helpers.DefaultHandler;
+
+/**
+ * SAX content handler that renders an XLIFF 1.2 document as XHTML and
+ * records summary information in the supplied {@link Metadata}.
+ *
+ * <p>Each {@code file} element becomes a {@code div} headed by an {@code h1}
+ * holding the value of its {@code original} attribute. Each
+ * {@code trans-unit} becomes a nested {@code div}, and its {@code source}
+ * and {@code target} children become {@code p} elements. Character data is
+ * only forwarded while inside a {@code trans-unit}.</p>
+ *
+ * <p>Metadata written: {@code original}, {@code source-language} and
+ * {@code target-language} (one value per {@code file} element that carries
+ * the attribute), plus {@code file-count} and {@code tu-count} once the
+ * document ends.</p>
+ */
+public class XLIFF12ContentHandler extends DefaultHandler {
+
+    private final XHTMLContentHandler xhtml;
+    private final Metadata metadata;
+    private int numberOfFiles = 0;
+    private int numberOfTUs = 0;
+    private boolean inTransUnit = false;
+
+    /**
+     * @param xhtml    handler receiving the generated XHTML events
+     * @param metadata metadata object populated while parsing
+     */
+    XLIFF12ContentHandler(XHTMLContentHandler xhtml, Metadata metadata) {
+        this.xhtml = xhtml;
+        this.metadata = metadata;
+    }
+
+    /**
+     * Forwards the start of the document to the XHTML handler.
+     *
+     * @throws SAXException if the XHTML handler fails
+     */
+    @Override
+    public void startDocument() throws SAXException {
+        xhtml.startDocument();
+    }
+
+    /**
+     * Maps the XLIFF elements {@code file}, {@code trans-unit},
+     * {@code source} and {@code target} onto XHTML; any other element is
+     * ignored.
+     *
+     * @throws SAXException if the XHTML handler fails
+     */
+    @Override
+    public void startElement(
+            String uri, String localName, String qName, Attributes attributes)
+            throws SAXException {
+
+        if ("file".equals(localName)) {
+            startFile(attributes);
+        } else if ("trans-unit".equals(localName)) {
+            numberOfTUs++;
+            inTransUnit = true;
+            xhtml.startElement("div", copyOf(attributes));
+        } else if ("source".equals(localName) || "target".equals(localName)) {
+            xhtml.startElement("p", copyOf(attributes));
+        }
+    }
+
+    /**
+     * Closes the XHTML element opened for the matching XLIFF start tag.
+     *
+     * @throws SAXException if the XHTML handler fails
+     */
+    @Override
+    public void endElement(String uri, String localName, String qName) throws SAXException {
+
+        if ("file".equals(localName)) {
+            xhtml.endElement("div");
+        } else if ("trans-unit".equals(localName)) {
+            inTransUnit = false;
+            xhtml.endElement("div");
+        } else if ("source".equals(localName) || "target".equals(localName)) {
+            xhtml.endElement("p");
+        }
+    }
+
+    /**
+     * Forwards character data, but only while inside a {@code trans-unit}.
+     *
+     * @throws SAXException if the XHTML handler fails
+     */
+    @Override
+    public void characters(char[] ch, int start, int length)
+            throws SAXException {
+        if (inTransUnit && length != 0) {
+            xhtml.characters(ch, start, length);
+        }
+    }
+
+    /**
+     * Records the file and trans-unit counters and closes the XHTML document.
+     *
+     * @throws SAXException if the XHTML handler fails
+     */
+    @Override
+    public void endDocument() throws SAXException {
+        metadata.set("file-count", String.valueOf(numberOfFiles));
+        metadata.set("tu-count", String.valueOf(numberOfTUs));
+        // Without this the wrapped handler never sees the end of the document.
+        xhtml.endDocument();
+    }
+
+    /** Handles the opening of a {@code file} element: heading plus metadata. */
+    private void startFile(Attributes attributes) throws SAXException {
+        numberOfFiles++;
+
+        // The original file name comes from the "original" attribute
+        // (the previous code read "source-language" here by mistake).
+        final String original = attributes.getValue("original");
+        addIfPresent("original", original);
+
+        xhtml.startElement("div");
+        xhtml.startElement("h1");
+        if (original != null) {
+            xhtml.characters(original);
+        }
+        xhtml.endElement("h1");
+
+        // Add the file's source and target languages
+        addIfPresent("source-language", attributes.getValue("source-language"));
+        addIfPresent("target-language", attributes.getValue("target-language"));
+    }
+
+    /** Adds a metadata value, skipping attributes absent from the element. */
+    private void addIfPresent(String name, String value) {
+        if (value != null) {
+            metadata.add(name, value);
+        }
+    }
+
+    /** Copies the SAX attributes so they can be handed on to the XHTML handler. */
+    private static AttributesImpl copyOf(Attributes attributes) {
+        final AttributesImpl copy = new AttributesImpl();
+        copy.setAttributes(attributes);
+        return copy;
+    }
+
+}
diff --git 
a/tika-parsers/src/main/java/org/apache/tika/parser/xliff/XLIFF12Parser.java 
b/tika-parsers/src/main/java/org/apache/tika/parser/xliff/XLIFF12Parser.java
new file mode 100644
index 0000000..40218b0
--- /dev/null
+++ b/tika-parsers/src/main/java/org/apache/tika/parser/xliff/XLIFF12Parser.java
@@ -0,0 +1,77 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.tika.parser.xliff;
+
+import org.apache.commons.io.input.CloseShieldInputStream;
+import org.apache.tika.exception.TikaException;
+import org.apache.tika.metadata.Metadata;
+import org.apache.tika.mime.MediaType;
+import org.apache.tika.parser.AbstractParser;
+import org.apache.tika.parser.ParseContext;
+import org.apache.tika.sax.OfflineContentHandler;
+import org.apache.tika.sax.XHTMLContentHandler;
+import org.apache.tika.utils.XMLReaderUtils;
+import org.xml.sax.ContentHandler;
+import org.xml.sax.SAXException;
+
+import java.io.IOException;
+import java.io.InputStream;
+import java.util.Collections;
+import java.util.Set;
+
+/**
+ * Tika parser for XLIFF 1.2 documents.
+ *
+ * <p>The input is read as XML through {@link XMLReaderUtils} and the SAX
+ * events are translated to XHTML by {@link XLIFF12ContentHandler}.</p>
+ */
+public class XLIFF12Parser extends AbstractParser {
+
+    /** Serial Version UID. */
+    private static final long serialVersionUID = 1490085649251663857L;
+
+    /** Pre-Xliff 2.0 mime type. */
+    private static final MediaType XLF_CONTENT_TYPE = MediaType.application("x-xliff+xml");
+
+    /** The single media type this parser declares support for. */
+    private static final Set<MediaType> SUPPORTED_TYPES = Collections.singleton(XLF_CONTENT_TYPE);
+
+    /**
+     * @param context parse context (not consulted)
+     * @return the set containing only the XLIFF media type
+     */
+    @Override
+    public Set<MediaType> getSupportedTypes(ParseContext context) {
+        return SUPPORTED_TYPES;
+    }
+
+    /**
+     * Parses an XLIFF 1.2 stream, setting the content type on the metadata
+     * and emitting XHTML to the given handler.
+     *
+     * @param stream   the document stream; wrapped so that it is not closed here
+     * @param handler  receiver of the generated XHTML SAX events
+     * @param metadata metadata to populate
+     * @param context  parse context passed through to the XML reader utilities
+     */
+    @Override
+    public void parse(
+            InputStream stream, ContentHandler handler,
+            Metadata metadata, ParseContext context)
+            throws IOException, SAXException, TikaException {
+
+        metadata.set(Metadata.CONTENT_TYPE, XLF_CONTENT_TYPE.toString());
+
+        final ContentHandler xliffHandler =
+                new XLIFF12ContentHandler(new XHTMLContentHandler(handler, metadata), metadata);
+        final InputStream shielded = new CloseShieldInputStream(stream);
+
+        XMLReaderUtils.parseSAX(shielded, new OfflineContentHandler(xliffHandler), context);
+    }
+
+}
diff --git 
a/tika-parsers/src/main/resources/META-INF/services/org.apache.tika.parser.Parser
 
b/tika-parsers/src/main/resources/META-INF/services/org.apache.tika.parser.Parser
index a33a578..d6edd56 100644
--- 
a/tika-parsers/src/main/resources/META-INF/services/org.apache.tika.parser.Parser
+++ 
b/tika-parsers/src/main/resources/META-INF/services/org.apache.tika.parser.Parser
@@ -81,3 +81,4 @@ org.apache.tika.parser.external.CompositeExternalParser
 org.apache.tika.parser.journal.JournalParser
 org.apache.tika.parser.image.ICNSParser
 org.apache.tika.parser.dbf.DBFParser
+org.apache.tika.parser.xliff.XLIFF12Parser
\ No newline at end of file
diff --git 
a/tika-parsers/src/test/java/org/apache/tika/parser/xliff/XLIFF12ParserTest.java
 
b/tika-parsers/src/test/java/org/apache/tika/parser/xliff/XLIFF12ParserTest.java
new file mode 100644
index 0000000..9f69ea5
--- /dev/null
+++ 
b/tika-parsers/src/test/java/org/apache/tika/parser/xliff/XLIFF12ParserTest.java
@@ -0,0 +1,48 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.tika.parser.xliff;
+
+import static org.apache.tika.TikaTest.assertContains;
+import static org.junit.Assert.assertEquals;
+
+import java.io.InputStream;
+
+import org.apache.tika.metadata.Metadata;
+import org.apache.tika.parser.ParseContext;
+import org.apache.tika.sax.BodyContentHandler;
+import org.junit.Test;
+import org.xml.sax.ContentHandler;
+
+public class XLIFF12ParserTest {
+
+    /**
+     * Parses the bundled sample document and checks the extracted text
+     * together with the file/trans-unit counters and language metadata.
+     */
+    @Test
+    public void testXLIFF12() throws Exception {
+        final Metadata metadata = new Metadata();
+        final ContentHandler handler = new BodyContentHandler();
+
+        try (InputStream input =
+                XLIFF12ParserTest.class.getResourceAsStream("/test-documents/testXLIFF12.xlf")) {
+            new XLIFF12Parser().parse(input, handler, metadata, new ParseContext());
+        }
+
+        assertContains("Hooray", handler.toString());
+        assertEquals("2", metadata.get("file-count"));
+        assertEquals("4", metadata.get("tu-count"));
+        assertEquals("en", metadata.get("source-language"));
+        assertEquals("fr", metadata.get("target-language"));
+    }
+
+}
diff --git a/tika-parsers/src/test/resources/test-documents/testXLIFF12.xlf 
b/tika-parsers/src/test/resources/test-documents/testXLIFF12.xlf
new file mode 100644
index 0000000..13b01ce
--- /dev/null
+++ b/tika-parsers/src/test/resources/test-documents/testXLIFF12.xlf
@@ -0,0 +1,35 @@
+<?xml version="1.0" encoding="UTF-8"?>
+<xliff version="1.2"
+       xmlns="urn:oasis:names:tc:xliff:document:1.2"
+       xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
+       xsi:schemaLocation="urn:oasis:names:tc:xliff:document:1.2 xliff-core-1.2-strict.xsd">
+    <file original="sample.properties"
+          source-language="en" target-language="fr"
+          datatype="javapropertyresourcebundle">
+        <body>
+            <trans-unit id="1" resname="welcome">
+                <source xml:lang="en">Hooray, you're here! The day just got 
better - enjoy the following tips!</source>
+                <target xml:lang="fr">Hurrah, tu es là. Ça fait une bonne 
journée - profite des renseignements suivants !</target>
+            </trans-unit>
+            <trans-unit id="2" resname="text_segment">
+                <source xml:lang="en">A section of text like this is known as 
a text segment. Start rockin' your translations now!</source>
+                <target xml:lang="fr">Un tel extrait est qualifié de segment 
du texte chez nous. C’est parti – commence tes traductions tout de 
suite/maintenant !</target>
+            </trans-unit>
+            <trans-unit id="3" resname="tab_shortcut">
+                <source xml:lang="en">Arriba, Arriba! Andale, Andale! Be fast 
as Speedy Gonzales. Just hit TAB to save and go to the next text segment, once 
you're done.</source>
+                <target xml:lang="fr">Arriba, Arriba ! Andale, Andale ! Prends 
Speedy Gonzales de vitesse. Appuie sur TAB pour sauvegarder et continuer avec 
le segment prochain dès que tu as fini.</target>
+            </trans-unit>
+        </body>
+    </file>
+
+    <file original="sample2.properties"
+          source-language="en" target-language="fr"
+          datatype="javapropertyresourcebundle">
+        <body>
+            <trans-unit id="4" resname="welcome">
+                <source xml:lang="en">Another trans-unit</source>
+                <target xml:lang="fr">Un autre trans-unit</target>
+            </trans-unit>
+        </body>
+    </file>
+</xliff>
\ No newline at end of file

Reply via email to