Repository: tika Updated Branches: refs/heads/2.x 1ce93ed9e -> cd12917fa
fix indents and whitespace Project: http://git-wip-us.apache.org/repos/asf/tika/repo Commit: http://git-wip-us.apache.org/repos/asf/tika/commit/76744261 Tree: http://git-wip-us.apache.org/repos/asf/tika/tree/76744261 Diff: http://git-wip-us.apache.org/repos/asf/tika/diff/76744261 Branch: refs/heads/2.x Commit: 767442614756b51b64427e663a2af1f6b6ac0bff Parents: 1ce93ed Author: tballison <[email protected]> Authored: Fri Jun 24 11:06:33 2016 -0400 Committer: tballison <[email protected]> Committed: Fri Jun 24 11:06:33 2016 -0400 ---------------------------------------------------------------------- .../org/apache/tika/parser/prt/PRTParser.java | 554 ++++++++++--------- 1 file changed, 279 insertions(+), 275 deletions(-) ---------------------------------------------------------------------- http://git-wip-us.apache.org/repos/asf/tika/blob/76744261/tika-parser-modules/tika-parser-cad-module/src/main/java/org/apache/tika/parser/prt/PRTParser.java ---------------------------------------------------------------------- diff --git a/tika-parser-modules/tika-parser-cad-module/src/main/java/org/apache/tika/parser/prt/PRTParser.java b/tika-parser-modules/tika-parser-cad-module/src/main/java/org/apache/tika/parser/prt/PRTParser.java index 92e3503..24418b0 100644 --- a/tika-parser-modules/tika-parser-cad-module/src/main/java/org/apache/tika/parser/prt/PRTParser.java +++ b/tika-parser-modules/tika-parser-cad-module/src/main/java/org/apache/tika/parser/prt/PRTParser.java @@ -1,275 +1,279 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one or more - * contributor license agreements. See the NOTICE file distributed with - * this work for additional information regarding copyright ownership. - * The ASF licenses this file to You under the Apache License, Version 2.0 - * (the "License"); you may not use this file except in compliance with - * the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ -package org.apache.tika.parser.prt; - -import java.io.IOException; -import java.io.InputStream; -import java.io.UnsupportedEncodingException; -import java.util.Collections; -import java.util.Set; - -import org.apache.commons.io.IOUtils; -import org.apache.tika.exception.TikaException; -import org.apache.tika.io.EndianUtils; -import org.apache.tika.metadata.Metadata; -import org.apache.tika.metadata.TikaCoreProperties; -import org.apache.tika.mime.MediaType; -import org.apache.tika.parser.AbstractParser; -import org.apache.tika.parser.ParseContext; -import org.apache.tika.sax.XHTMLContentHandler; -import org.xml.sax.ContentHandler; -import org.xml.sax.SAXException; - -import static java.nio.charset.StandardCharsets.US_ASCII; - -/** - * A basic text extracting parser for the CADKey PRT (CAD Drawing) - * format. It outputs text from note entries. - */ - -public class PRTParser extends AbstractParser { - - /** Serial version UID */ - private static final long serialVersionUID = 4659638314375035178L; - - private static final Set<MediaType> SUPPORTED_TYPES = Collections.singleton(MediaType.application("x-prt")); - public static final String PRT_MIME_TYPE = "application/x-prt"; - - public Set<MediaType> getSupportedTypes(ParseContext context) { - return SUPPORTED_TYPES; - } - - /** - * How long do we allow a text run to claim to be, before we - * decide we're confused and it's not really text after all? - */ - private static final int MAX_SANE_TEXT_LENGTH = 0x0800; - - /* - * Text types: - * 00 00 00 00 f0 [3b]f sz sz TEXT *view name* - * 00 00 00 00 f0 3f 00 00 00 00 00 00 00 00 sz sz TEXT *view name* - * (anything) e0 3f sz sz TEXT *view name* - * 3x 33 33 33 33 33 e3 3f 0x 00 00 0x 00 00 0x 0x 1f sz sz TEXT *note entries* - * - * Note - all text is null terminated - */ - - public void parse(InputStream stream, ContentHandler handler, Metadata metadata, - ParseContext context) throws IOException, SAXException, TikaException { - - XHTMLContentHandler xhtml = new XHTMLContentHandler(handler, metadata); - Last5 l5 = new Last5(); - int read; - - // Try to get the creation date, which is YYYYMMDDhhmm - byte[] header = new byte[30]; - IOUtils.readFully(stream, header); - byte[] date = new byte[12]; - IOUtils.readFully(stream, date); - - String dateStr = new String(date, US_ASCII); - if(dateStr.startsWith("19") || dateStr.startsWith("20")) { - String formattedDate = dateStr.substring(0, 4) + "-" + dateStr.substring(4,6) + - "-" + dateStr.substring(6,8) + "T" + dateStr.substring(8,10) + ":" + - dateStr.substring(10, 12) + ":00"; - metadata.set(TikaCoreProperties.CREATED, formattedDate); - // TODO Metadata.DATE is used as modified, should it be here? - metadata.set(Metadata.DATE, formattedDate); - } - metadata.set(Metadata.CONTENT_TYPE, PRT_MIME_TYPE); - - // The description, if set, is the next up-to-500 bytes - byte[] desc = new byte[500]; - IOUtils.readFully(stream, desc); - String description = extractText(desc, true); - if(description.length() > 0) { - metadata.set(TikaCoreProperties.DESCRIPTION, description); - } - - // Now look for text - while( (read = stream.read()) > -1) { - if(read == 0xe0 || read == 0xe3 || read == 0xf0) { - int nread = stream.read(); - if(nread == 0x3f || nread == 0xbf) { - // Looks promising, check back for a suitable value - if(read == 0xe3 && nread == 0x3f) { - if(l5.is33()) { - // Bingo, note text - handleNoteText(stream, xhtml); - } - } else if(l5.is00()) { - // Likely view name - handleViewName(read, nread, stream, xhtml, l5); - } - } - } else { - l5.record(read); - } - } - } - - private void handleNoteText(InputStream stream, XHTMLContentHandler xhtml) - throws IOException, SAXException, TikaException { - // Ensure we have the right padding text - int read; - for(int i=0; i<10; i++) { - read = stream.read(); - if(read >= 0 && read <= 0x0f) { - // Promising - } else { - // Wrong, false detection - return; - } - } - read = stream.read(); - if(read != 0x1f) { - // Wrong, false detection - return; - } - - int length = EndianUtils.readUShortLE(stream); - if(length <= MAX_SANE_TEXT_LENGTH) { - // Length sanity check passed - handleText(length, stream, xhtml); - } - } - - private void handleViewName(int typeA, int typeB, InputStream stream, - XHTMLContentHandler xhtml, Last5 l5) - throws IOException, SAXException, TikaException { - // Is it 8 byte zero padded? - int maybeLength = EndianUtils.readUShortLE(stream); - if(maybeLength == 0) { - // Check the next 6 bytes too - for(int i=0; i<6; i++) { - int read = stream.read(); - if(read >= 0 && read <= 0x0f) { - // Promising - } else { - // Wrong, false detection - return; - } - } - - byte[] b2 = new byte[2]; - IOUtils.readFully(stream, b2); - int length = EndianUtils.getUShortLE(b2); - if(length > 1 && length <= MAX_SANE_TEXT_LENGTH) { - // Length sanity check passed - handleText(length, stream, xhtml); - } else { - // Was probably something else - l5.record(b2[0]); - l5.record(b2[1]); - } - } else if(maybeLength > 0 && maybeLength < MAX_SANE_TEXT_LENGTH) { - // Looks like it's straight into the text - handleText(maybeLength, stream, xhtml); - } - } - - private void handleText(int length, InputStream stream, XHTMLContentHandler xhtml) - throws IOException, SAXException, TikaException { - byte[] str = new byte[length]; - IOUtils.readFully(stream, str); - if(str[length-1] != 0) { - // Not properly null terminated, must be wrong - return; - } - - String text = extractText(str, false); - - xhtml.startElement("p"); - xhtml.characters(text); - xhtml.endElement("p"); - } - - /** - * Does our best to turn the bytes into text - */ - private String extractText(byte[] data, boolean trim) throws TikaException { - // The text is always stored null terminated, but sometimes - // may have extra null padding too - int length = data.length - 1; - if(trim) { - for(int i=0; i<data.length; i++) { - if(data[i] == 0) { - length = i; - break; - } - } - } - - // We believe that the text is basically stored as CP437 - // That said, there are a few characters slightly wrong for that... - String text; - try { - text = new String(data, 0, length, "cp437"); - } catch(UnsupportedEncodingException e) { - throw new TikaException("JVM Broken, core codepage CP437 missing!"); - } - - // Fix up the known character issues - text = text.replace("\u03C6","\u00D8"); - - // All done, as best as we can! - return text; - } - - /** - * Provides a view on the previous 5 bytes - */ - private static class Last5 { - byte[] data = new byte[5]; - int pos = 0; - - private void record(int b) { - data[pos] = (byte)b; - pos++; - if(pos >= data.length) { - pos = 0; - } - } - - private byte[] get() { - byte[] ret = new byte[5]; - for(int i=0; i<ret.length; i++) { - int p = pos - i; - if(p < 0) { p += ret.length; } - ret[i] = data[p]; - } - return ret; - } - - private boolean is33() { - byte[] last5 = get(); - for(byte b : last5) { - if(b != 0x33) return false; - } - return true; - } - - private boolean is00() { - byte[] last5 = get(); - for(byte b : last5) { - if(b != 0x00) return false; - } - return true; - } - } -} +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.apache.tika.parser.prt; + +import static java.nio.charset.StandardCharsets.US_ASCII; + +import java.io.IOException; +import java.io.InputStream; +import java.io.UnsupportedEncodingException; +import java.util.Collections; +import java.util.Set; + +import org.apache.commons.io.IOUtils; +import org.apache.tika.exception.TikaException; +import org.apache.tika.io.EndianUtils; +import org.apache.tika.metadata.Metadata; +import org.apache.tika.metadata.TikaCoreProperties; +import org.apache.tika.mime.MediaType; +import org.apache.tika.parser.AbstractParser; +import org.apache.tika.parser.ParseContext; +import org.apache.tika.sax.XHTMLContentHandler; +import org.xml.sax.ContentHandler; +import org.xml.sax.SAXException; + +/** + * A basic text extracting parser for the CADKey PRT (CAD Drawing) + * format. It outputs text from note entries. + */ + +public class PRTParser extends AbstractParser { + + /** + * Serial version UID + */ + private static final long serialVersionUID = 4659638314375035178L; + + private static final Set<MediaType> SUPPORTED_TYPES = Collections.singleton(MediaType.application("x-prt")); + public static final String PRT_MIME_TYPE = "application/x-prt"; + + public Set<MediaType> getSupportedTypes(ParseContext context) { + return SUPPORTED_TYPES; + } + + /** + * How long do we allow a text run to claim to be, before we + * decide we're confused and it's not really text after all? + */ + private static final int MAX_SANE_TEXT_LENGTH = 0x0800; + + /* + * Text types: + * 00 00 00 00 f0 [3b]f sz sz TEXT *view name* + * 00 00 00 00 f0 3f 00 00 00 00 00 00 00 00 sz sz TEXT *view name* + * (anything) e0 3f sz sz TEXT *view name* + * 3x 33 33 33 33 33 e3 3f 0x 00 00 0x 00 00 0x 0x 1f sz sz TEXT *note entries* + * + * Note - all text is null terminated + */ + + public void parse(InputStream stream, ContentHandler handler, Metadata metadata, + ParseContext context) throws IOException, SAXException, TikaException { + + XHTMLContentHandler xhtml = new XHTMLContentHandler(handler, metadata); + Last5 l5 = new Last5(); + int read; + + // Try to get the creation date, which is YYYYMMDDhhmm + byte[] header = new byte[30]; + IOUtils.readFully(stream, header); + byte[] date = new byte[12]; + IOUtils.readFully(stream, date); + + String dateStr = new String(date, US_ASCII); + if (dateStr.startsWith("19") || dateStr.startsWith("20")) { + String formattedDate = dateStr.substring(0, 4) + "-" + dateStr.substring(4, 6) + + "-" + dateStr.substring(6, 8) + "T" + dateStr.substring(8, 10) + ":" + + dateStr.substring(10, 12) + ":00"; + metadata.set(TikaCoreProperties.CREATED, formattedDate); + // TODO Metadata.DATE is used as modified, should it be here? + metadata.set(Metadata.DATE, formattedDate); + } + metadata.set(Metadata.CONTENT_TYPE, PRT_MIME_TYPE); + + // The description, if set, is the next up-to-500 bytes + byte[] desc = new byte[500]; + IOUtils.readFully(stream, desc); + String description = extractText(desc, true); + if (description.length() > 0) { + metadata.set(TikaCoreProperties.DESCRIPTION, description); + } + + // Now look for text + while ((read = stream.read()) > -1) { + if (read == 0xe0 || read == 0xe3 || read == 0xf0) { + int nread = stream.read(); + if (nread == 0x3f || nread == 0xbf) { + // Looks promising, check back for a suitable value + if (read == 0xe3 && nread == 0x3f) { + if (l5.is33()) { + // Bingo, note text + handleNoteText(stream, xhtml); + } + } else if (l5.is00()) { + // Likely view name + handleViewName(read, nread, stream, xhtml, l5); + } + } + } else { + l5.record(read); + } + } + } + + private void handleNoteText(InputStream stream, XHTMLContentHandler xhtml) + throws IOException, SAXException, TikaException { + // Ensure we have the right padding text + int read; + for (int i = 0; i < 10; i++) { + read = stream.read(); + if (read >= 0 && read <= 0x0f) { + // Promising + } else { + // Wrong, false detection + return; + } + } + read = stream.read(); + if (read != 0x1f) { + // Wrong, false detection + return; + } + + int length = EndianUtils.readUShortLE(stream); + if (length <= MAX_SANE_TEXT_LENGTH) { + // Length sanity check passed + handleText(length, stream, xhtml); + } + } + + private void handleViewName(int typeA, int typeB, InputStream stream, + XHTMLContentHandler xhtml, Last5 l5) + throws IOException, SAXException, TikaException { + // Is it 8 byte zero padded? + int maybeLength = EndianUtils.readUShortLE(stream); + if (maybeLength == 0) { + // Check the next 6 bytes too + for (int i = 0; i < 6; i++) { + int read = stream.read(); + if (read >= 0 && read <= 0x0f) { + // Promising + } else { + // Wrong, false detection + return; + } + } + + byte[] b2 = new byte[2]; + IOUtils.readFully(stream, b2); + int length = EndianUtils.getUShortLE(b2); + if (length > 1 && length <= MAX_SANE_TEXT_LENGTH) { + // Length sanity check passed + handleText(length, stream, xhtml); + } else { + // Was probably something else + l5.record(b2[0]); + l5.record(b2[1]); + } + } else if (maybeLength > 0 && maybeLength < MAX_SANE_TEXT_LENGTH) { + // Looks like it's straight into the text + handleText(maybeLength, stream, xhtml); + } + } + + private void handleText(int length, InputStream stream, XHTMLContentHandler xhtml) + throws IOException, SAXException, TikaException { + byte[] str = new byte[length]; + IOUtils.readFully(stream, str); + if (str[length - 1] != 0) { + // Not properly null terminated, must be wrong + return; + } + + String text = extractText(str, false); + + xhtml.startElement("p"); + xhtml.characters(text); + xhtml.endElement("p"); + } + + /** + * Does our best to turn the bytes into text + */ + private String extractText(byte[] data, boolean trim) throws TikaException { + // The text is always stored null terminated, but sometimes + // may have extra null padding too + int length = data.length - 1; + if (trim) { + for (int i = 0; i < data.length; i++) { + if (data[i] == 0) { + length = i; + break; + } + } + } + + // We believe that the text is basically stored as CP437 + // That said, there are a few characters slightly wrong for that... + String text; + try { + text = new String(data, 0, length, "cp437"); + } catch (UnsupportedEncodingException e) { + throw new TikaException("JVM Broken, core codepage CP437 missing!"); + } + + // Fix up the known character issues + text = text.replace("\u03C6", "\u00D8"); + + // All done, as best as we can! + return text; + } + + /** + * Provides a view on the previous 5 bytes + */ + private static class Last5 { + byte[] data = new byte[5]; + int pos = 0; + + private void record(int b) { + data[pos] = (byte) b; + pos++; + if (pos >= data.length) { + pos = 0; + } + } + + private byte[] get() { + byte[] ret = new byte[5]; + for (int i = 0; i < ret.length; i++) { + int p = pos - i; + if (p < 0) { + p += ret.length; + } + ret[i] = data[p]; + } + return ret; + } + + private boolean is33() { + byte[] last5 = get(); + for (byte b : last5) { + if (b != 0x33) return false; + } + return true; + } + + private boolean is00() { + byte[] last5 = get(); + for (byte b : last5) { + if (b != 0x00) return false; + } + return true; + } + } +}
