[
https://issues.apache.org/jira/browse/TIKA-679?page=com.atlassian.jira.plugin.system.issuetabpanels:all-tabpanel
]
Troy Witthoeft updated TIKA-679:
--------------------------------
Description:
It would be nice if Tika had support for prt CAD files.
A preliminary prt text extractor has been created.
Any assistance further developing this code is appreciated.
{code:title=PRTParser.java|borderStyle=solid}
package org.apache.tika.parser.prt;
import java.io.BufferedInputStream;
import java.io.BufferedReader;
import java.io.IOException;
import java.io.InputStream;
import java.io.InputStreamReader;
import java.io.Reader;
import java.io.UnsupportedEncodingException;
import java.nio.charset.Charset;
import java.util.Collections;
import java.util.Set;
import org.apache.poi.util.IOUtils;
import org.apache.tika.exception.TikaException;
import org.apache.tika.metadata.Metadata;
import org.apache.tika.mime.MediaType;
import org.apache.tika.parser.ParseContext;
import org.apache.tika.parser.Parser;
import org.apache.tika.sax.XHTMLContentHandler;
import org.xml.sax.ContentHandler;
import org.xml.sax.SAXException;
/**
* Description: PRT (CAD Drawing) parser. This is a very basic parser.
* Searches for specific byte prefix, and outputs text from note entities
* Does not support special DRAFT-PAK characters.
*/
public class PRTParser implements Parser {
private static final Set<MediaType> SUPPORTED_TYPES =
Collections.singleton(MediaType.application("prt"));
public static final String PRT_MIME_TYPE = "application/prt";
/**
 * Reports the media types handled by this parser.
 *
 * @param context parse context; not consulted by this implementation
 * @return the shared single-element set built from
 *         {@code MediaType.application("prt")}
 */
public Set<MediaType> getSupportedTypes(ParseContext context) {
    return SUPPORTED_TYPES;
}
public void parse(
InputStream stream, ContentHandler handler,
Metadata metadata, ParseContext context)
throws IOException, SAXException, TikaException {
XHTMLContentHandler xhtml = new XHTMLContentHandler(handler,
metadata);
int[] prefix = new int[] {227, 63};
//Looking for a prefix set of bytes {E3, 3F}
int pos = 0;
int read;
while( (read = stream.read()) > -1) {
// stream.read() moves to the next byte, and returns an integer value
of the byte. a value of -1 signals the EOF
if(read == prefix[pos]) {
// is the last byte read the same as the first
byte in the prefix?
pos++;
if(pos == prefix.length) {
stream.skip(11);
// skip the 13 bytes of
the prefix which can vary.
int length = stream.read();
// Set the next byte equal to
the length of text in the user input field, see PRT schema
stream.skip(1);
byte[] text = new byte[length];
// a new byte array called text is
created. It should contain an array of integer values of the user inputted
text.
IOUtils.readFully(stream, text);
String str = new String(text, 0,
text.length, "UTF-8"); // turn it into a string, but does not remove null
termination, assumes it's found to be utf-8
xhtml.startElement("p");
xhtml.characters(str);
xhtml.endElement("p");
pos--;
}
}
else {
//Did not find the prefix. Reset the position
counter.
pos = 0;
}
}
}
/**
 * Parses a PRT stream by delegating to the four-argument
 * {@code parse} method with a newly created {@code ParseContext}.
 *
 * @param stream   PRT data to read
 * @param handler  receives the XHTML SAX events
 * @param metadata document metadata
 * @throws IOException   propagated from the delegate
 * @throws SAXException  propagated from the delegate
 * @throws TikaException propagated from the delegate
 * @deprecated This method will be removed in Apache Tika 1.0.
 */
public void parse(
        InputStream stream, ContentHandler handler, Metadata metadata)
        throws IOException, SAXException, TikaException {
    ParseContext defaultContext = new ParseContext();
    parse(stream, handler, metadata, defaultContext);
}
}{code}
was:
It would be nice if Tika had support for prt CAD files.
A preliminary prt text extractor has been created.
{code:title=PRTParser.java|borderStyle=solid}
package org.apache.tika.parser.prt;
import java.io.BufferedInputStream;
import java.io.BufferedReader;
import java.io.IOException;
import java.io.InputStream;
import java.io.InputStreamReader;
import java.io.Reader;
import java.io.UnsupportedEncodingException;
import java.nio.charset.Charset;
import java.util.Collections;
import java.util.Set;
import org.apache.poi.util.IOUtils;
import org.apache.tika.exception.TikaException;
import org.apache.tika.metadata.Metadata;
import org.apache.tika.mime.MediaType;
import org.apache.tika.parser.ParseContext;
import org.apache.tika.parser.Parser;
import org.apache.tika.sax.XHTMLContentHandler;
import org.xml.sax.ContentHandler;
import org.xml.sax.SAXException;
/**
* Description: PRT (CAD Drawing) parser. This is a very basic parser.
* Searches for specific byte prefix, and outputs text from note entities
* Does not support special DRAFT-PAK characters.
*/
public class PRTParser implements Parser {
private static final Set<MediaType> SUPPORTED_TYPES =
Collections.singleton(MediaType.application("prt"));
public static final String PRT_MIME_TYPE = "application/prt";
/**
 * Reports the media types handled by this parser.
 *
 * @param context parse context; not consulted by this implementation
 * @return the shared single-element set built from
 *         {@code MediaType.application("prt")}
 */
public Set<MediaType> getSupportedTypes(ParseContext context) {
    return SUPPORTED_TYPES;
}
public void parse(
InputStream stream, ContentHandler handler,
Metadata metadata, ParseContext context)
throws IOException, SAXException, TikaException {
XHTMLContentHandler xhtml = new XHTMLContentHandler(handler,
metadata);
int[] prefix = new int[] {227, 63};
//Looking for a prefix set of bytes {E3, 3F}
int pos = 0;
int read;
while( (read = stream.read()) > -1) {
// stream.read() moves to the next byte, and returns an integer value
of the byte. a value of -1 signals the EOF
if(read == prefix[pos]) {
// is the last byte read the same as the first
byte in the prefix?
pos++;
if(pos == prefix.length) {
stream.skip(11);
// skip the 13 bytes of
the prefix which can vary.
int length = stream.read();
// Set the next byte equal to
the length of text in the user input field, see PRT schema
stream.skip(1);
byte[] text = new byte[length];
// a new byte array called text is
created. It should contain an array of integer values of the user inputted
text.
IOUtils.readFully(stream, text);
String str = new String(text, 0,
text.length, "UTF-8"); // turn it into a string, but does not remove null
termination, assumes it's found to be utf-8
xhtml.startElement("p");
xhtml.characters(str);
xhtml.endElement("p");
pos--;
}
}
else {
//Did not find the prefix. Reset the position
counter.
pos = 0;
}
}
}
/**
 * Parses a PRT stream by delegating to the four-argument
 * {@code parse} method with a newly created {@code ParseContext}.
 *
 * @param stream   PRT data to read
 * @param handler  receives the XHTML SAX events
 * @param metadata document metadata
 * @throws IOException   propagated from the delegate
 * @throws SAXException  propagated from the delegate
 * @throws TikaException propagated from the delegate
 * @deprecated This method will be removed in Apache Tika 1.0.
 */
public void parse(
        InputStream stream, ContentHandler handler, Metadata metadata)
        throws IOException, SAXException, TikaException {
    ParseContext defaultContext = new ParseContext();
    parse(stream, handler, metadata, defaultContext);
}
}{code}
I am looking for assistance in improving this code. I am in the process of
picking apart the prt file structure.
Here are my findings.
The file header contains a magic mime type, a file creation date, and a file
description.
The magic mime type can be identified with <match value="0M3C" type="string"
offset="8" />
If present, the file creation date is after the identifier. It is in format
YYYYMMDDhhmm. It is always in the same address, 0x001Eh-0x002Ah OR the
31st-43rd bytes.
If present, the user entered file description IMMEDIATELY follows date. Max
chars is 498. It is always at the same address, 0x002Bh-0x021Ch OR the
43rd-540th bytes. Terminated with [00][01][C8]
The goal is to extract the user entered text. User text is marked by a prefix
of 42 bytes. Newest entries are at the top of the file.
The prefix is always marked by the presence of six 3's and [E3][3F], that is
followed by 10 variable bytes, then a byte signifying the length of the user
input text + 1, and a null.
GUIDE
[3#][33][33][33][33][33][E3][3F][0#][00][00][0#][00][00][0#][0#][0#][1F][ln][00][USERINPUT
TEXT][00][xx]
EXAMPLE
[33][33][33][33][33][33][E3][3F][00][00][00][00][00][00][00][02][01][1F][05][00][54][49][4B][41][00][0B]
= TIKA
Any pointers on how to improve the code are appreciated.
> Proposal for PRT Parser
> -----------------------
>
> Key: TIKA-679
> URL: https://issues.apache.org/jira/browse/TIKA-679
> Project: Tika
> Issue Type: Improvement
> Components: mime, parser
> Reporter: Troy Witthoeft
> Priority: Minor
> Labels: CAD, Mime, Parser, Prt, Tika
> Attachments: TikaTest.prt
>
> Original Estimate: 672h
> Remaining Estimate: 672h
>
> It would be nice if Tika had support for prt CAD files.
> A preliminary prt text extractor has been created.
> Any assistance further developing this code is appreciated.
> {code:title=PRTParser.java|borderStyle=solid}
> package org.apache.tika.parser.prt;
> import java.io.BufferedInputStream;
> import java.io.BufferedReader;
> import java.io.IOException;
> import java.io.InputStream;
> import java.io.InputStreamReader;
> import java.io.Reader;
> import java.io.UnsupportedEncodingException;
> import java.nio.charset.Charset;
> import java.util.Collections;
> import java.util.Set;
> import org.apache.poi.util.IOUtils;
> import org.apache.tika.exception.TikaException;
> import org.apache.tika.metadata.Metadata;
> import org.apache.tika.mime.MediaType;
> import org.apache.tika.parser.ParseContext;
> import org.apache.tika.parser.Parser;
> import org.apache.tika.sax.XHTMLContentHandler;
> import org.xml.sax.ContentHandler;
> import org.xml.sax.SAXException;
> /**
> * Description: PRT (CAD Drawing) parser. This is a very basic parser.
> * Searches for specific byte prefix, and outputs text from note entities
> * Does not support special DRAFT-PAK characters.
> */
> public class PRTParser implements Parser {
> private static final Set<MediaType> SUPPORTED_TYPES =
> Collections.singleton(MediaType.application("prt"));
> public static final String PRT_MIME_TYPE = "application/prt";
>
> /**
>  * Reports the media types handled by this parser.
>  *
>  * @param context parse context; not consulted by this implementation
>  * @return the shared single-element set built from
>  *         {@code MediaType.application("prt")}
>  */
> public Set<MediaType> getSupportedTypes(ParseContext context) {
>     return SUPPORTED_TYPES;
> }
>
> public void parse(
> InputStream stream, ContentHandler handler,
> Metadata metadata, ParseContext context)
> throws IOException, SAXException, TikaException {
> XHTMLContentHandler xhtml = new XHTMLContentHandler(handler,
> metadata);
> int[] prefix = new int[] {227, 63};
> //Looking for a prefix set of bytes {E3, 3F}
> int pos = 0;
>
> int read;
> while( (read = stream.read()) > -1) {
> // stream.read() moves to the next byte, and returns an integer value
> of the byte. a value of -1 signals the EOF
> if(read == prefix[pos]) {
> // is the last byte read the same as the
> first byte in the prefix?
> pos++;
>
> if(pos == prefix.length) {
>
> stream.skip(11);
> // skip the 13 bytes
> of the prefix which can vary.
> int length = stream.read();
> // Set the next byte equal to
> the length of text in the user input field, see PRT schema
> stream.skip(1);
>
> byte[] text = new byte[length];
> // a new byte array called text is
> created. It should contain an array of integer values of the user inputted
> text.
> IOUtils.readFully(stream, text);
>
> String str = new String(text, 0,
> text.length, "UTF-8"); // turn it into a string, but does not remove null
> termination, assumes it's found to be utf-8
> xhtml.startElement("p");
> xhtml.characters(str);
> xhtml.endElement("p");
> pos--;
> }
> }
> else {
> //Did not find the prefix. Reset the position
> counter.
> pos = 0;
> }
> }
> }
>
> /**
> * @deprecated This method will be removed in Apache Tika 1.0.
> */
> public void parse(
> InputStream stream, ContentHandler handler, Metadata
> metadata)
> throws IOException, SAXException, TikaException {
> parse(stream, handler, metadata, new ParseContext());
> }
> }{code}
>
--
This message is automatically generated by JIRA.
For more information on JIRA, see: http://www.atlassian.com/software/jira