This is an automated email from the ASF dual-hosted git repository. tallison pushed a commit to branch branch_1x in repository https://gitbox.apache.org/repos/asf/tika.git
commit 6a733506581d263b545a4a40767b116a3659b12e Author: tallison <[email protected]> AuthorDate: Tue Jun 2 08:50:54 2020 -0400 TIKA-3104 -- fix markup to be closer to plist xml, add embedded document extractor --- .../org/apache/tika/parser/apple/PListParser.java | 116 ++++++++++++++++----- .../apache/tika/parser/apple/PListParserTest.java | 8 +- 2 files changed, 99 insertions(+), 25 deletions(-) diff --git a/tika-parsers/src/main/java/org/apache/tika/parser/apple/PListParser.java b/tika-parsers/src/main/java/org/apache/tika/parser/apple/PListParser.java index ff56efe..643a611 100644 --- a/tika-parsers/src/main/java/org/apache/tika/parser/apple/PListParser.java +++ b/tika-parsers/src/main/java/org/apache/tika/parser/apple/PListParser.java @@ -26,8 +26,9 @@ import com.dd.plist.NSSet; import com.dd.plist.NSString; import com.dd.plist.PropertyListFormatException; import com.dd.plist.PropertyListParser; -import com.lexicalscope.jewelcli.internal.cglib.asm.$MethodAdapter; import org.apache.tika.exception.TikaException; +import org.apache.tika.extractor.EmbeddedDocumentExtractor; +import org.apache.tika.extractor.EmbeddedDocumentUtil; import org.apache.tika.io.TikaInputStream; import org.apache.tika.metadata.Metadata; import org.apache.tika.mime.MediaType; @@ -41,30 +42,44 @@ import org.xml.sax.SAXException; import javax.xml.parsers.ParserConfigurationException; import java.io.IOException; import java.io.InputStream; +import java.text.DateFormat; import java.text.ParseException; +import java.text.SimpleDateFormat; import java.util.Collections; import java.util.Map; import java.util.Set; /** * Parser for Apple's plist and bplist. This is a wrapper around - * <groupId>com.googlecode.plist</groupId> - * <artifactId>dd-plist</artifactId> - * <version>1.23</version> + * com.googlecode.plist:dd-plist */ public class PListParser extends AbstractParser { + private static final String ARR = "array"; + private static final String DATA = "data"; + private static final String DATE = "date"; + private static final String DICT = "dict"; + private static final String KEY = "key"; + private static final String NUMBER = "number"; + private static final String PLIST = "plist"; + private static final String SET = "set"; + private static final String STRING = "string"; + + private static final Set<MediaType> SUPPORTED_TYPES = Collections.singleton(MediaType.application("x-bplist")); - @Override public Set<MediaType> getSupportedTypes(ParseContext context) { return SUPPORTED_TYPES; } @Override - public void parse(InputStream stream, ContentHandler handler, Metadata metadata, ParseContext context) throws IOException, SAXException, TikaException { + public void parse(InputStream stream, ContentHandler handler, Metadata metadata, + ParseContext context) throws IOException, SAXException, TikaException { + EmbeddedDocumentExtractor embeddedDocumentExtractor = + EmbeddedDocumentUtil.getEmbeddedDocumentExtractor(context); + DateFormat df = new SimpleDateFormat("yyyy-MM-dd'T'HH:mm:ssZ"); NSObject rootObj = null; try { if (stream instanceof TikaInputStream && ((TikaInputStream) stream).hasFile()) { @@ -76,47 +91,100 @@ public class PListParser extends AbstractParser { throw new TikaException("problem parsing root", e); } XHTMLContentHandler xhtml = new XHTMLContentHandler(handler, metadata); + State state = new State(xhtml, metadata, embeddedDocumentExtractor, df); xhtml.startDocument(); - parseObject(rootObj, xhtml, metadata); + xhtml.startElement(PLIST); + parseObject(rootObj, state); + xhtml.endElement(PLIST); xhtml.endDocument(); } - private void parseObject(NSObject obj, XHTMLContentHandler handler, Metadata metadata) - throws SAXException { + private void parseObject(NSObject obj, State state) + throws SAXException, IOException { if (obj instanceof NSDictionary) { - parseDict((NSDictionary)obj, handler, metadata); + parseDict((NSDictionary)obj, state); } else if (obj instanceof NSArray) { NSArray nsArray = (NSArray)obj; + state.xhtml.startElement(ARR); for (NSObject child : nsArray.getArray()) { - parseObject(child, handler, metadata); + parseObject(child, state); } + state.xhtml.endElement(ARR); } else if (obj instanceof NSString) { - handler.characters(((NSString)obj).toString()); + state.xhtml.startElement(STRING); + state.xhtml.characters(((NSString)obj).getContent()); + state.xhtml.endElement(STRING); } else if (obj instanceof NSNumber) { - handler.characters(((NSNumber) obj).toString()); + state.xhtml.startElement(NUMBER); + state.xhtml.characters(((NSNumber) obj).toString()); + state.xhtml.endElement(NUMBER); } else if (obj instanceof NSData) { - handleData((NSData) obj, handler, metadata); + state.xhtml.startElement(DATA); + handleData((NSData) obj, state); + state.xhtml.endElement(DATA); } else if (obj instanceof NSDate) { - handler.characters(((NSDate)obj).toString()); - } else{ - throw new UnsupportedOperationException("don't know baout: "+obj.getClass()); + state.xhtml.startElement(DATE); + String dateString = state.dateFormat.format(((NSDate)obj).getDate()); + state.xhtml.characters(dateString); + state.xhtml.endElement(DATE); + } else if (obj instanceof NSSet) { + state.xhtml.startElement(SET); + parseSet((NSSet)obj, state); + state.xhtml.endElement(SET); + } else { + throw new UnsupportedOperationException("don't yet support this type of object: "+obj.getClass()); + } + } + private void parseSet(NSSet obj, State state) + throws SAXException, IOException { + state.xhtml.startElement(SET); + for (NSObject child : obj.allObjects()) { + parseObject(child, state); } + state.xhtml.endElement(SET); } - private void parseDict(NSDictionary obj, XHTMLContentHandler xhtml, Metadata metadata) throws SAXException { + private void parseDict(NSDictionary obj, State state) + throws SAXException, IOException { + state.xhtml.startElement(DICT); for (Map.Entry<String, NSObject> mapEntry : obj.getHashMap().entrySet()) { String key = mapEntry.getKey(); NSObject value = mapEntry.getValue(); - xhtml.startElement("div", "class", key); - parseObject(value, xhtml, metadata); - xhtml.endElement("div"); + state.xhtml.element(KEY, key); + parseObject(value, state); + } + state.xhtml.endElement(DICT); + } + + private void handleData(NSData value, State state) throws IOException, + SAXException { + state.xhtml.characters(value.getBase64EncodedData()); + Metadata embeddedMetadata = new Metadata(); + if (! state.embeddedDocumentExtractor.shouldParseEmbedded(embeddedMetadata)) { + return; + } + + try (TikaInputStream tis = TikaInputStream.get(value.bytes())) { + state.embeddedDocumentExtractor.parseEmbedded(tis, state.xhtml, embeddedMetadata, false); } } - private void handleData(NSData value, XHTMLContentHandler handler, Metadata metadata) { - byte[] bytes = value.bytes(); - //TODO handle embedded file + private static class State { + final XHTMLContentHandler xhtml; + final Metadata metadata; + final EmbeddedDocumentExtractor embeddedDocumentExtractor; + final DateFormat dateFormat; + + public State(XHTMLContentHandler xhtml, + Metadata metadata, + EmbeddedDocumentExtractor embeddedDocumentExtractor, + DateFormat df) { + this.xhtml = xhtml; + this.metadata = metadata; + this.embeddedDocumentExtractor = embeddedDocumentExtractor; + this.dateFormat = df; + } } } diff --git a/tika-parsers/src/test/java/org/apache/tika/parser/apple/PListParserTest.java b/tika-parsers/src/test/java/org/apache/tika/parser/apple/PListParserTest.java index 534f65b..9d78548 100644 --- a/tika-parsers/src/test/java/org/apache/tika/parser/apple/PListParserTest.java +++ b/tika-parsers/src/test/java/org/apache/tika/parser/apple/PListParserTest.java @@ -23,6 +23,8 @@ import org.junit.Test; import java.util.List; +import static org.junit.Assert.assertEquals; + public class PListParserTest extends TikaTest { @@ -31,8 +33,12 @@ public class PListParserTest extends TikaTest { //test file is MIT licensed: // https://github.com/joeferner/node-bplist-parser/blob/master/test/iTunes-small.bplist List<Metadata> metadataList = getRecursiveMetadata("testBPList.bplist"); + assertEquals(21, metadataList.size()); Metadata m = metadataList.get(0); String content = m.get(AbstractRecursiveParserWrapperHandler.TIKA_CONTENT); - assertContains("<div class=\"Application Version\">9.0.3</div>", content); + assertContains("<key>Application Version</key><string>9.0", content); + + //TODO -- bad encoding right after this...smart quote? + assertContains("<string>90", content); } }
