TIKA-2022 -- add applefile parser
Project: http://git-wip-us.apache.org/repos/asf/tika/repo Commit: http://git-wip-us.apache.org/repos/asf/tika/commit/0f3b0bdb Tree: http://git-wip-us.apache.org/repos/asf/tika/tree/0f3b0bdb Diff: http://git-wip-us.apache.org/repos/asf/tika/diff/0f3b0bdb Branch: refs/heads/master Commit: 0f3b0bdb5b78177e9f0fca88f889e7919823c177 Parents: 47221b9 Author: tballison <[email protected]> Authored: Mon Jun 27 09:28:07 2016 -0400 Committer: tballison <[email protected]> Committed: Mon Jun 27 09:28:07 2016 -0400 ---------------------------------------------------------------------- .../parser/apple/AppleSingleFileParser.java | 205 +++++++++++++++++++ .../services/org.apache.tika.parser.Parser | 1 + .../parser/apple/AppleSingleFileParserTest.java | 46 +++++ .../test-documents/testAppleSingleFile.pdf | Bin 0 -> 54926 bytes 4 files changed, 252 insertions(+) ---------------------------------------------------------------------- http://git-wip-us.apache.org/repos/asf/tika/blob/0f3b0bdb/tika-parsers/src/main/java/org/apache/tika/parser/apple/AppleSingleFileParser.java ---------------------------------------------------------------------- diff --git a/tika-parsers/src/main/java/org/apache/tika/parser/apple/AppleSingleFileParser.java b/tika-parsers/src/main/java/org/apache/tika/parser/apple/AppleSingleFileParser.java new file mode 100644 index 0000000..789629e --- /dev/null +++ b/tika-parsers/src/main/java/org/apache/tika/parser/apple/AppleSingleFileParser.java @@ -0,0 +1,205 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. 
You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.apache.tika.parser.apple; + +import java.io.IOException; +import java.io.InputStream; +import java.nio.charset.StandardCharsets; +import java.util.ArrayList; +import java.util.Collections; +import java.util.Comparator; +import java.util.List; +import java.util.Set; + +import org.apache.commons.io.IOUtils; +import org.apache.tika.exception.TikaException; +import org.apache.tika.extractor.EmbeddedDocumentExtractor; +import org.apache.tika.extractor.ParsingEmbeddedDocumentExtractor; +import org.apache.tika.io.CloseShieldInputStream; +import org.apache.tika.io.EndianUtils; +import org.apache.tika.metadata.Metadata; +import org.apache.tika.mime.MediaType; +import org.apache.tika.parser.AbstractParser; +import org.apache.tika.parser.ParseContext; +import org.apache.tika.sax.XHTMLContentHandler; +import org.xml.sax.ContentHandler; +import org.xml.sax.SAXException; + +/** + * Parser that strips the header off of AppleSingle and AppleDouble + * files. + * <p> + * See <a href="http://kaiser-edv.de/documents/AppleSingle_AppleDouble.pdf">spec document</a>. + */ +public class AppleSingleFileParser extends AbstractParser { + + /** + * Entry types + */ + public static final int DATA_FORK = 1; + public static final int RESOURCE_FORK = 2; + public static final int REAL_NAME = 3; + public static final int COMMENT = 4; + public static final int ICON_BW = 5; + public static final int ICON_COLOR = 6; + //7?! 
+ public static final int FILE_DATES_INFO = 8; + public static final int FINDER_INFO = 9; + public static final int MACINTOSH_FILE_INFO = 10; + public static final int PRODOS_FILE_INFO = 11; + public static final int MSDOS_FILE_INFO = 12; + public static final int SHORT_NAME = 13; + public static final int AFP_FILE_INFO = 14; + public static final int DIRECTORY_ID = 15; + + private static final Set<MediaType> SUPPORTED_TYPES = + Collections.singleton(MediaType.application("applefile")); + + public Set<MediaType> getSupportedTypes(ParseContext context) { + return SUPPORTED_TYPES; + } + + @Override + public void parse(InputStream stream, ContentHandler handler, + Metadata metadata, ParseContext context) + throws IOException, SAXException, TikaException { + + EmbeddedDocumentExtractor ex = context.get(EmbeddedDocumentExtractor.class); + + if (ex == null) { + ex = new ParsingEmbeddedDocumentExtractor(context); + } + + short numEntries = readThroughNumEntries(stream); + long bytesRead = 26; + List<FieldInfo> fieldInfoList = getSortedFieldInfoList(stream, numEntries); + bytesRead += 12*numEntries; + Metadata embeddedMetadata = new Metadata(); + bytesRead = processFieldEntries(stream, fieldInfoList, embeddedMetadata, bytesRead); + FieldInfo contentFieldInfo = getContentFieldInfo(fieldInfoList); + XHTMLContentHandler xhtml = new XHTMLContentHandler(handler, metadata); + xhtml.startDocument(); + if (contentFieldInfo != null) { + System.out.println(contentFieldInfo.offset + " "+bytesRead); + long diff = contentFieldInfo.offset-bytesRead; + IOUtils.skipFully(stream, diff); + if (ex.shouldParseEmbedded(embeddedMetadata)) { + // TODO: we should probably add a readlimiting wrapper around this + // stream to ensure that not more than contentFieldInfo.length bytes + // are read + ex.parseEmbedded(new CloseShieldInputStream(stream), + xhtml, embeddedMetadata, false); + } + } + xhtml.endDocument(); + + } + + private FieldInfo getContentFieldInfo(List<FieldInfo> fieldInfoList) { + 
for (FieldInfo fieldInfo : fieldInfoList) { + if (fieldInfo.entryId == 1) { + return fieldInfo; + } + } + return null; + } + + private long processFieldEntries(InputStream stream, List<FieldInfo> fieldInfoList, + Metadata embeddedMetadata, long bytesRead) throws IOException, TikaException { + byte[] buffer = null; + for (FieldInfo f : fieldInfoList) { + long diff = f.offset - bytesRead; + //just in case + IOUtils.skipFully(stream, diff); + bytesRead += diff; + if (f.entryId == REAL_NAME) { + if (f.length > Integer.MAX_VALUE) { + throw new TikaException("File name length can't be > integer max"); + } + buffer = new byte[(int)f.length]; + IOUtils.readFully(stream, buffer); + bytesRead += f.length; + String originalFileName = new String(buffer, 0, buffer.length, StandardCharsets.US_ASCII); + //TODO: figure out correct metadata key + //embeddedMetadata.set(TikaCoreProperties.IDENTIFIER, originalFileName); + } else if (f.entryId != DATA_FORK) { + IOUtils.skipFully(stream, f.length); + bytesRead += f.length; + } + } + return bytesRead; + } + + + private List<FieldInfo> getSortedFieldInfoList(InputStream stream, short numEntries) throws IOException, TikaException { + //this is probably overkill. I'd hope that these were already + //in order. This ensures it. + List<FieldInfo> fieldInfoList = new ArrayList<>(numEntries); + for (int i = 0; i < numEntries; i++) { + //convert 32-bit unsigned ints to longs + fieldInfoList.add( + new FieldInfo( + EndianUtils.readIntBE(stream) & 0x00000000ffffffffL, //entry id + EndianUtils.readIntBE(stream) & 0x00000000ffffffffL, //offset + EndianUtils.readIntBE(stream) & 0x00000000ffffffffL //length + ) + ); + } + if (fieldInfoList.size() == 0) { + throw new TikaException("AppleSingleFile missing field info"); + } + //make absolutely sure these are in order! 
+ Collections.sort(fieldInfoList, new FieldInfoComparator()); + return fieldInfoList; + } + + //read through header until you hit the number of entries + private short readThroughNumEntries(InputStream stream) throws TikaException, IOException { + //mime + EndianUtils.readIntBE(stream); + //version + long version = EndianUtils.readIntBE(stream); + if (version != 0x00020000) { + throw new TikaException("Version should have been 0x00020000, but was:"+version); + } + IOUtils.skipFully(stream, 16);//filler + return EndianUtils.readShortBE(stream);//number of entries + } + + private class FieldInfo { + + private final long entryId; + private final long offset; + private final long length; + + private FieldInfo(long entryId, long offset, long length) { + this.entryId = entryId; + this.offset = offset; + this.length = length; + } + } + + private static class FieldInfoComparator implements Comparator<FieldInfo> { + + @Override + public int compare(FieldInfo o1, FieldInfo o2) { + return (o1.offset > o2.offset) ? 1 : + (o1.offset == o2.offset) ? 0 : -1 ; + } + } + +} http://git-wip-us.apache.org/repos/asf/tika/blob/0f3b0bdb/tika-parsers/src/main/resources/META-INF/services/org.apache.tika.parser.Parser ---------------------------------------------------------------------- diff --git a/tika-parsers/src/main/resources/META-INF/services/org.apache.tika.parser.Parser b/tika-parsers/src/main/resources/META-INF/services/org.apache.tika.parser.Parser index 10a5a7e..6ed2f6c 100644 --- a/tika-parsers/src/main/resources/META-INF/services/org.apache.tika.parser.Parser +++ b/tika-parsers/src/main/resources/META-INF/services/org.apache.tika.parser.Parser @@ -13,6 +13,7 @@ # See the License for the specific language governing permissions and # limitations under the License. 
+org.apache.tika.parser.apple.AppleSingleFileParser org.apache.tika.parser.asm.ClassParser org.apache.tika.parser.audio.AudioParser org.apache.tika.parser.audio.MidiParser http://git-wip-us.apache.org/repos/asf/tika/blob/0f3b0bdb/tika-parsers/src/test/java/org/apache/tika/parser/apple/AppleSingleFileParserTest.java ---------------------------------------------------------------------- diff --git a/tika-parsers/src/test/java/org/apache/tika/parser/apple/AppleSingleFileParserTest.java b/tika-parsers/src/test/java/org/apache/tika/parser/apple/AppleSingleFileParserTest.java new file mode 100644 index 0000000..5890e7e --- /dev/null +++ b/tika-parsers/src/test/java/org/apache/tika/parser/apple/AppleSingleFileParserTest.java @@ -0,0 +1,46 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
*/

package org.apache.tika.parser.apple;


import static org.junit.Assert.assertEquals;

import java.util.Arrays;
import java.util.List;

import org.apache.tika.TikaTest;
import org.apache.tika.metadata.Metadata;
import org.apache.tika.metadata.TikaCoreProperties;
import org.apache.tika.parser.RecursiveParserWrapper;
import org.apache.tika.parser.pdf.PDFParser;
import org.junit.Test;

/**
 * Checks that an AppleSingle-wrapped PDF is unwrapped by
 * {@link AppleSingleFileParser} and that its data fork is parsed as a PDF.
 */
public class AppleSingleFileParserTest extends TikaTest {

    /**
     * Expects two metadata objects from the recursive parse: the first
     * handled by {@link AppleSingleFileParser}, the second by
     * {@link PDFParser} and carrying the PDF's text and title.
     */
    @Test
    public void testBasic() throws Exception {
        List<Metadata> list = getRecursiveJson("testAppleSingleFile.pdf");
        // JUnit's signature is assertEquals(expected, actual); keeping that
        // order makes a failure message report the right values.
        assertEquals(2, list.size());
        assertContains(AppleSingleFileParser.class.getName(),
                Arrays.asList(list.get(0).getValues("X-Parsed-By")));
        assertContains(PDFParser.class.getName(),
                Arrays.asList(list.get(1).getValues("X-Parsed-By")));
        assertContains("END OF SORTIE NUMBER TWO", list.get(1).get(RecursiveParserWrapper.TIKA_CONTENT));
        assertEquals("fltsyllabussortie2rev1.2", list.get(1).get(TikaCoreProperties.TITLE));
    }
}
http://git-wip-us.apache.org/repos/asf/tika/blob/0f3b0bdb/tika-parsers/src/test/resources/test-documents/testAppleSingleFile.pdf
----------------------------------------------------------------------
diff --git a/tika-parsers/src/test/resources/test-documents/testAppleSingleFile.pdf b/tika-parsers/src/test/resources/test-documents/testAppleSingleFile.pdf
new file mode 100644
index 0000000..a385313
Binary files /dev/null and b/tika-parsers/src/test/resources/test-documents/testAppleSingleFile.pdf differ
