[
https://issues.apache.org/jira/browse/TIKA-679?page=com.atlassian.jira.plugin.system.issuetabpanels:comment-tabpanel&focusedCommentId=13059601#comment-13059601
]
Nick Burch commented on TIKA-679:
---------------------------------
I've added the detector part in r1142795; thanks for the file and the match line.
For the special characters you mention, which text entry in the file contains
them? And where?
I'll start taking a look at the parser shortly.
> Proposal for PRT Parser
> -----------------------
>
> Key: TIKA-679
> URL: https://issues.apache.org/jira/browse/TIKA-679
> Project: Tika
> Issue Type: Improvement
> Components: mime, parser
> Reporter: Troy Witthoeft
> Priority: Minor
> Labels: CAD, Mime, Parser, Prt, Tika
> Attachments: TikaTest.prt
>
> Original Estimate: 672h
> Remaining Estimate: 672h
>
> It would be nice if Tika had support for prt CAD files.
> A preliminary prt text extractor has been created.
> Any assistance further developing this code is appreciated.
> {code:title=PRTParser.java|borderStyle=solid}
> package org.apache.tika.parser.prt;
> import java.io.BufferedInputStream;
> import java.io.BufferedReader;
> import java.io.IOException;
> import java.io.InputStream;
> import java.io.InputStreamReader;
> import java.io.Reader;
> import java.io.UnsupportedEncodingException;
> import java.nio.charset.Charset;
> import java.util.Collections;
> import java.util.Set;
> import org.apache.poi.util.IOUtils;
> import org.apache.tika.exception.TikaException;
> import org.apache.tika.metadata.Metadata;
> import org.apache.tika.mime.MediaType;
> import org.apache.tika.parser.ParseContext;
> import org.apache.tika.parser.Parser;
> import org.apache.tika.sax.XHTMLContentHandler;
> import org.xml.sax.ContentHandler;
> import org.xml.sax.SAXException;
> /**
>  * PRT (CAD drawing) parser. This is a very basic parser.
>  * <p>
>  * It scans the stream byte by byte for the marker bytes {E3, 3F}.
>  * After each marker it skips a fixed number of bytes, reads a one-byte
>  * text length, skips one more byte, and emits that many bytes (decoded
>  * as UTF-8) as a paragraph of text from a note entity.
>  * <p>
>  * Does not support special DRAFT-PAK characters.
>  */
> public class PRTParser implements Parser {
>
>     private static final Set<MediaType> SUPPORTED_TYPES =
>             Collections.singleton(MediaType.application("prt"));
>
>     public static final String PRT_MIME_TYPE = "application/prt";
>
>     /** Marker bytes {E3, 3F} that introduce a text entry. */
>     private static final int[] PREFIX = {0xE3, 0x3F};
>
>     /**
>      * Bytes skipped between the marker and the length byte; their
>      * content can vary.
>      * NOTE(review): the original comment spoke of 13 bytes while the
>      * code skipped 11 -- confirm against the PRT schema.
>      */
>     private static final int HEADER_SKIP = 11;
>
>     public Set<MediaType> getSupportedTypes(ParseContext context) {
>         return SUPPORTED_TYPES;
>     }
>
>     /**
>      * Extracts every text entry found in the stream, one paragraph
>      * per entry.
>      *
>      * @param stream   the PRT document; read up to EOF, not closed
>      * @param handler  receives the XHTML SAX events
>      * @param metadata passed through to the XHTML handler
>      * @param context  parse context (not used by this parser)
>      * @throws IOException   if reading from the stream fails
>      * @throws SAXException  if the handler rejects an event
>      * @throws TikaException declared by the Parser interface
>      */
>     public void parse(
>             InputStream stream, ContentHandler handler,
>             Metadata metadata, ParseContext context)
>             throws IOException, SAXException, TikaException {
>         XHTMLContentHandler xhtml =
>                 new XHTMLContentHandler(handler, metadata);
>         xhtml.startDocument();
>
>         // Number of marker bytes matched so far.
>         int pos = 0;
>         int read;
>         // read() returns the next byte as an int, or -1 at EOF.
>         while ((read = stream.read()) > -1) {
>             if (read != PREFIX[pos]) {
>                 // Mismatch. The byte may still be the start of a new
>                 // marker (e.g. E3 E3 3F), so do not blindly reset.
>                 pos = (read == PREFIX[0]) ? 1 : 0;
>                 continue;
>             }
>
>             pos++;
>             if (pos < PREFIX.length) {
>                 continue;
>             }
>
>             // Whole marker matched; start from scratch afterwards.
>             pos = 0;
>
>             if (!skipFully(stream, HEADER_SKIP)) {
>                 break; // truncated file
>             }
>             // Length of the user-entered text, see PRT schema.
>             int length = stream.read();
>             if (length < 0 || !skipFully(stream, 1)) {
>                 break; // truncated file
>             }
>
>             byte[] text = new byte[length];
>             int count = 0;
>             if (length > 0) {
>                 // May return fewer bytes than requested, or -1, at EOF.
>                 count = Math.max(0, IOUtils.readFully(stream, text));
>             }
>
>             // Drop the trailing NUL terminator(s) from the text.
>             int end = count;
>             while (end > 0 && text[end - 1] == 0) {
>                 end--;
>             }
>
>             if (end > 0) {
>                 // Assumes the text is UTF-8 encoded.
>                 String str = new String(text, 0, end, "UTF-8");
>                 xhtml.startElement("p");
>                 xhtml.characters(str);
>                 xhtml.endElement("p");
>             }
>
>             if (count < length) {
>                 break; // hit EOF inside the text
>             }
>         }
>
>         xhtml.endDocument();
>     }
>
>     /**
>      * Skips exactly {@code count} bytes. InputStream.skip() may skip
>      * fewer bytes than asked, so fall back to read() when it stalls.
>      *
>      * @return false if EOF was reached before all bytes were skipped
>      */
>     private static boolean skipFully(InputStream stream, long count)
>             throws IOException {
>         long remaining = count;
>         while (remaining > 0) {
>             long skipped = stream.skip(remaining);
>             if (skipped > 0) {
>                 remaining -= skipped;
>             } else if (stream.read() < 0) {
>                 return false;
>             } else {
>                 remaining--;
>             }
>         }
>         return true;
>     }
>
>     /**
>      * @deprecated This method will be removed in Apache Tika 1.0.
>      */
>     public void parse(
>             InputStream stream, ContentHandler handler,
>             Metadata metadata)
>             throws IOException, SAXException, TikaException {
>         parse(stream, handler, metadata, new ParseContext());
>     }
> }{code}
>
--
This message is automatically generated by JIRA.
For more information on JIRA, see: http://www.atlassian.com/software/jira