This is an automated email from the ASF dual-hosted git repository. dmeikle pushed a commit to branch branch_1x in repository https://gitbox.apache.org/repos/asf/tika.git
commit 833d0c738acacfc8ca295bfa7451bf427ee5c2ee Author: David Meikle <[email protected]> AuthorDate: Wed Dec 18 23:21:28 2019 +0000 TIKA-3014: Update to fix XLIFF12Parser failures with ToXMLHandler (cherry picked from commit e9ab0942ac7196429c4a297df9212792729e33f0) --- .../tika/parser/xliff/XLIFF12ContentHandler.java | 25 ++++++++++++++++++---- .../apache/tika/parser/xliff/XLIFF12Parser.java | 2 ++ .../tika/parser/xliff/XLIFF12ParserTest.java | 10 +++++++-- 3 files changed, 31 insertions(+), 6 deletions(-) diff --git a/tika-parsers/src/main/java/org/apache/tika/parser/xliff/XLIFF12ContentHandler.java b/tika-parsers/src/main/java/org/apache/tika/parser/xliff/XLIFF12ContentHandler.java index 954c217..95ea20e 100644 --- a/tika-parsers/src/main/java/org/apache/tika/parser/xliff/XLIFF12ContentHandler.java +++ b/tika-parsers/src/main/java/org/apache/tika/parser/xliff/XLIFF12ContentHandler.java @@ -40,6 +40,11 @@ public class XLIFF12ContentHandler extends DefaultHandler { } @Override + public void startDocument() throws SAXException { + xhtml.startDocument(); + } + + @Override public void startElement( String uri, String localName, String qName, Attributes attributes) throws SAXException { @@ -58,9 +63,11 @@ public class XLIFF12ContentHandler extends DefaultHandler { xhtml.characters(attributes.getValue("original")); xhtml.endElement("h1"); - // Add the files source and target languages + // Add the files source (mandatory) and target (optional) languages metadata.add("source-language", attributes.getValue("source-language")); - metadata.add("target-language", attributes.getValue("target-language")); + if (null != attributes.getValue("target-language")) { + metadata.add("target-language", attributes.getValue("target-language")); + } } if ("trans-unit".equals(localName)) { @@ -70,12 +77,22 @@ public class XLIFF12ContentHandler extends DefaultHandler { } if ("source".equals(localName)) { - xhtml.startElement("p", attributeVals); + AttributesImpl attrs = extractAttributes(attributes); + xhtml.startElement("p", attrs); } if ("target".equals(localName)) { - xhtml.startElement("p", attributeVals); + AttributesImpl attrs = extractAttributes(attributes); + xhtml.startElement("p", attrs); + } + } + + private AttributesImpl extractAttributes(Attributes attributes) { + AttributesImpl attrs = new AttributesImpl(); + if (null != attributes.getValue("xml:lang")) { + attrs.addAttribute("", "lang", "lang", "", attributes.getValue("xml:lang")); } + return attrs; } @Override diff --git a/tika-parsers/src/main/java/org/apache/tika/parser/xliff/XLIFF12Parser.java b/tika-parsers/src/main/java/org/apache/tika/parser/xliff/XLIFF12Parser.java index 40218b0..d65a09c 100644 --- a/tika-parsers/src/main/java/org/apache/tika/parser/xliff/XLIFF12Parser.java +++ b/tika-parsers/src/main/java/org/apache/tika/parser/xliff/XLIFF12Parser.java @@ -23,6 +23,7 @@ import org.apache.tika.mime.MediaType; import org.apache.tika.parser.AbstractParser; import org.apache.tika.parser.ParseContext; import org.apache.tika.sax.OfflineContentHandler; +import org.apache.tika.sax.TaggedContentHandler; import org.apache.tika.sax.XHTMLContentHandler; import org.apache.tika.utils.XMLReaderUtils; import org.xml.sax.ContentHandler; @@ -67,6 +68,7 @@ public class XLIFF12Parser extends AbstractParser { metadata.set(Metadata.CONTENT_TYPE, XLF_CONTENT_TYPE.toString()); final XHTMLContentHandler xhtml = new XHTMLContentHandler(handler, metadata); + XMLReaderUtils.parseSAX( new CloseShieldInputStream(stream), new OfflineContentHandler(new XLIFF12ContentHandler(xhtml, metadata)), diff --git a/tika-parsers/src/test/java/org/apache/tika/parser/xliff/XLIFF12ParserTest.java b/tika-parsers/src/test/java/org/apache/tika/parser/xliff/XLIFF12ParserTest.java index 9f69ea5..d5e231b 100644 --- a/tika-parsers/src/test/java/org/apache/tika/parser/xliff/XLIFF12ParserTest.java +++ b/tika-parsers/src/test/java/org/apache/tika/parser/xliff/XLIFF12ParserTest.java @@ -16,18 +16,18 @@ */ package org.apache.tika.parser.xliff; -import static org.apache.tika.TikaTest.assertContains; import static org.junit.Assert.assertEquals; import java.io.InputStream; +import org.apache.tika.TikaTest; import org.apache.tika.metadata.Metadata; import org.apache.tika.parser.ParseContext; import org.apache.tika.sax.BodyContentHandler; import org.junit.Test; import org.xml.sax.ContentHandler; -public class XLIFF12ParserTest { +public class XLIFF12ParserTest extends TikaTest { @Test public void testXLIFF12() throws Exception { @@ -44,5 +44,11 @@ public class XLIFF12ParserTest { } } + @Test + public void testXLIFF12ToXMLHandler() throws Exception { + String xml = getXML("testXLIFF12.xlf").xml; + assertContains("<p lang=\"en\">Another trans-unit</p>", xml); + assertContains("<p lang=\"fr\">Un autre trans-unit</p>", xml); + } }
