Author: nick
Date: Fri Oct 7 21:05:22 2011
New Revision: 1180243

URL: http://svn.apache.org/viewvc?rev=1180243&view=rev
Log:
TIKA-749 Convert the DWG and PRT parsers to use the Tika endian util, rather than the POI one

Modified: tika/trunk/tika-core/src/main/java/org/apache/tika/io/EndianUtils.java tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/dwg/DWGParser.java tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/prt/PRTParser.java Modified: tika/trunk/tika-core/src/main/java/org/apache/tika/io/EndianUtils.java URL: http://svn.apache.org/viewvc/tika/trunk/tika-core/src/main/java/org/apache/tika/io/EndianUtils.java?rev=1180243&r1=1180242&r2=1180243&view=diff ============================================================================== --- tika/trunk/tika-core/src/main/java/org/apache/tika/io/EndianUtils.java (original) +++ tika/trunk/tika-core/src/main/java/org/apache/tika/io/EndianUtils.java Fri Oct 7 21:05:22 2011 @@ -169,8 +169,204 @@ public class EndianUtils { (ch7 << 8) + (ch8 << 0); } + + + /** + * Get a LE short value from the beginning of a byte array + * + *@param data the byte array + *@return the short (16-bit) value + */ + public static short getShortLE(byte[] data) { + return getShortLE(data, 0); + } + /** + * Get a LE short value from a byte array + * + *@param data the byte array + *@param offset a starting offset into the byte array + *@return the short (16-bit) value + */ + public static short getShortLE(byte[] data, int offset) { + return (short)getUShortLE(data, offset); + } + + /** + * Get a LE unsigned short value from the beginning of a byte array + * + *@param data the byte array + *@return the unsigned short (16-bit) value in an int + */ + public static int getUShortLE(byte[] data) { + return getUShortLE(data, 0); + } + /** + * Get a LE unsigned short value from a byte array + * + *@param data the byte array + *@param offset a starting offset into the byte array + *@return the unsigned short (16-bit) value in an integer + */ + public static int getUShortLE(byte[] data, int offset) { + int b0 = data[offset] & 0xFF; + int b1 = data[offset+1] & 0xFF; + return (b1 << 8) + (b0 << 0); + } + + /** + * Get a BE short value from the 
beginning of a byte array + * + *@param data the byte array + *@return the short (16-bit) value + */ + public static short getShortBE(byte[] data) { + return getShortBE(data, 0); + } + /** + * Get a BE short value from a byte array + * + *@param data the byte array + *@param offset a starting offset into the byte array + *@return the short (16-bit) value + */ + public static short getShortBE(byte[] data, int offset) { + return (short)getUShortBE(data, offset); + } + + /** + * Get a BE unsigned short value from the beginning of a byte array + * + *@param data the byte array + *@return the unsigned short (16-bit) value in an int + */ + public static int getUShortBE(byte[] data) { + return getUShortBE(data, 0); + } + /** + * Get a BE unsigned short value from a byte array + * + *@param data the byte array + *@param offset a starting offset into the byte array + *@return the unsigned short (16-bit) value in an integer + */ + public static int getUShortBE(byte[] data, int offset) { + int b0 = data[offset] & 0xFF; + int b1 = data[offset+1] & 0xFF; + return (b0 << 8) + (b1 << 0); + } + + /** + * Get a LE int value from the beginning of a byte array + * + *@param data the byte array + *@return the int (32-bit) value + */ + public static int getIntLE(byte[] data) { + return getIntLE(data, 0); + } + /** + * Get a LE int value from a byte array + * + *@param data the byte array + *@param offset a starting offset into the byte array + *@return the int (32-bit) value + */ + public static int getIntLE(byte[] data, int offset) { + int i=offset; + int b0 = data[i++] & 0xFF; + int b1 = data[i++] & 0xFF; + int b2 = data[i++] & 0xFF; + int b3 = data[i++] & 0xFF; + return (b3 << 24) + (b2 << 16) + (b1 << 8) + (b0 << 0); + } /** + * Get a BE int value from the beginning of a byte array + * + *@param data the byte array + *@return the int (32-bit) value + */ + public static int getIntBE(byte[] data) { + return getIntBE(data, 0); + } + /** + * Get a BE int value from a byte array + * + 
*@param data the byte array + *@param offset a starting offset into the byte array + *@return the int (32-bit) value + */ + public static int getIntBE(byte[] data, int offset) { + int i=offset; + int b0 = data[i++] & 0xFF; + int b1 = data[i++] & 0xFF; + int b2 = data[i++] & 0xFF; + int b3 = data[i++] & 0xFF; + return (b0 << 24) + (b1 << 16) + (b2 << 8) + (b3 << 0); + } + + /** + * Get a LE unsigned int value from a byte array + * + *@param data the byte array + *@return the unsigned int (32-bit) value in a long + */ + public static long getUIntLE(byte[] data) { + return getUIntLE(data,0); + } + /** + * Get a LE unsigned int value from a byte array + * + *@param data the byte array + *@param offset a starting offset into the byte array + *@return the unsigned int (32-bit) value in a long + */ + public static long getUIntLE(byte[] data, int offset) { + long retNum = getIntLE(data, offset); + return retNum & 0x00FFFFFFFFl; + } + + /** + * Get a BE unsigned int value from a byte array + * + *@param data the byte array + *@return the unsigned int (32-bit) value in a long + */ + public static long getUIntBE(byte[] data) { + return getUIntBE(data,0); + } + /** + * Get a BE unsigned int value from a byte array + * + *@param data the byte array + *@param offset a starting offset into the byte array + *@return the unsigned int (32-bit) value in a long + */ + public static long getUIntBE(byte[] data, int offset) { + long retNum = getIntBE(data, offset); + return retNum & 0x00FFFFFFFFl; + } + + /** + * Get a LE long value from a byte array + * + *@param data the byte array + *@param offset a starting offset into the byte array + *@return the long (64-bit) value + */ + public static long getLongLE(byte[] data, int offset) { + long result = 0; + + for (int j = offset + LONG_SIZE - 1; j >= offset; j--) { + result <<= 8; + result |= 0xff & data[j]; + } + return result; + } + private static final int LONG_SIZE = 8; + + + /** * Convert an 'unsigned' byte to an integer. 
ie, don't carry across the * sign. * @@ -195,7 +391,9 @@ public class EndianUtils { return (short) ( data[offset] & 0xFF ); } + public static class BufferUnderrunException extends TikaException { + private static final long serialVersionUID = 8358288231138076276L; public BufferUnderrunException() { super("Insufficient data left in stream for required read"); } Modified: tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/dwg/DWGParser.java URL: http://svn.apache.org/viewvc/tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/dwg/DWGParser.java?rev=1180243&r1=1180242&r2=1180243&view=diff ============================================================================== --- tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/dwg/DWGParser.java (original) +++ tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/dwg/DWGParser.java Fri Oct 7 21:05:22 2011 @@ -22,9 +22,9 @@ import java.util.Collections; import java.util.Set; import org.apache.poi.util.IOUtils; -import org.apache.poi.util.LittleEndian; import org.apache.poi.util.StringUtil; import org.apache.tika.exception.TikaException; +import org.apache.tika.io.EndianUtils; import org.apache.tika.metadata.Metadata; import org.apache.tika.mime.MediaType; import org.apache.tika.parser.AbstractParser; @@ -130,7 +130,7 @@ public class DWGParser extends AbstractP */ private void get2004Props( InputStream stream, Metadata metadata, XHTMLContentHandler xhtml) - throws IOException, SAXException { + throws IOException, TikaException, SAXException { // Standard properties for (int i = 0; i < HEADER_PROPERTIES_ENTRIES.length; i++) { String headerValue = read2004String(stream); @@ -148,8 +148,8 @@ public class DWGParser extends AbstractP } } - private String read2004String(InputStream stream) throws IOException { - int stringLen = LittleEndian.readUShort(stream); + private String read2004String(InputStream stream) throws IOException, TikaException { + int stringLen = EndianUtils.readUShortLE(stream); 
byte[] stringData = new byte[stringLen]; IOUtils.readFully(stream, stringData); @@ -167,7 +167,7 @@ public class DWGParser extends AbstractP */ private void get2007and2010Props( InputStream stream, Metadata metadata, XHTMLContentHandler xhtml) - throws IOException, SAXException { + throws IOException, TikaException, SAXException { // Standard properties for (int i = 0; i < HEADER_PROPERTIES_ENTRIES.length; i++) { String headerValue = read2007and2010String(stream); @@ -185,8 +185,8 @@ public class DWGParser extends AbstractP } } - private String read2007and2010String(InputStream stream) throws IOException { - int stringLen = LittleEndian.readUShort(stream); + private String read2007and2010String(InputStream stream) throws IOException, TikaException { + int stringLen = EndianUtils.readUShortLE(stream); byte[] stringData = new byte[stringLen * 2]; IOUtils.readFully(stream, stringData); @@ -202,11 +202,11 @@ public class DWGParser extends AbstractP private void get2000Props( InputStream stream, Metadata metadata, XHTMLContentHandler xhtml) - throws IOException, SAXException { + throws IOException, TikaException, SAXException { int propCount = 0; while(propCount < 30) { - int propIdx = LittleEndian.readUShort(stream); - int length = LittleEndian.readUShort(stream); + int propIdx = EndianUtils.readUShortLE(stream); + int length = EndianUtils.readUShortLE(stream); int valueType = stream.read(); if(propIdx == 0x28) { @@ -262,9 +262,9 @@ public class DWGParser extends AbstractP * Grab the offset, then skip there */ private boolean skipToPropertyInfoSection(InputStream stream, byte[] header) - throws IOException { + throws IOException, TikaException { // The offset is stored in the header from 0x20 onwards - long offsetToSection = LittleEndian.getLong(header, 0x20); + long offsetToSection = EndianUtils.getLongLE(header, 0x20); long toSkip = offsetToSection - header.length; if(offsetToSection == 0){ return false; @@ -301,7 +301,7 @@ public class DWGParser extends AbstractP } 
private int skipToCustomProperties(InputStream stream) - throws IOException { + throws IOException, TikaException { // There should be 4 zero bytes next byte[] padding = new byte[4]; IOUtils.readFully(stream, padding); @@ -312,7 +312,7 @@ public class DWGParser extends AbstractP IOUtils.readFully(stream, padding); // We should now have the count - int count = LittleEndian.readUShort(stream); + int count = EndianUtils.readUShortLE(stream); // Sanity check it if(count > 0 && count < 0x7f) { Modified: tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/prt/PRTParser.java URL: http://svn.apache.org/viewvc/tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/prt/PRTParser.java?rev=1180243&r1=1180242&r2=1180243&view=diff ============================================================================== --- tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/prt/PRTParser.java (original) +++ tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/prt/PRTParser.java Fri Oct 7 21:05:22 2011 @@ -23,8 +23,8 @@ import java.util.Collections; import java.util.Set; import org.apache.poi.util.IOUtils; -import org.apache.poi.util.LittleEndian; import org.apache.tika.exception.TikaException; +import org.apache.tika.io.EndianUtils; import org.apache.tika.metadata.Metadata; import org.apache.tika.mime.MediaType; import org.apache.tika.parser.AbstractParser; @@ -135,7 +135,7 @@ public class PRTParser extends AbstractP return; } - int length = LittleEndian.readUShort(stream); + int length = EndianUtils.readUShortLE(stream); if(length <= MAX_SANE_TEXT_LENGTH) { // Length sanity check passed handleText(length, stream, xhtml); @@ -146,7 +146,7 @@ public class PRTParser extends AbstractP XHTMLContentHandler xhtml, Last5 l5) throws IOException, SAXException, TikaException { // Is it 8 byte zero padded? 
- int maybeLength = LittleEndian.readUShort(stream); + int maybeLength = EndianUtils.readUShortLE(stream); if(maybeLength == 0) { // Check the next 6 bytes too for(int i=0; i<6; i++) { @@ -161,7 +161,7 @@ public class PRTParser extends AbstractP byte[] b2 = new byte[2]; IOUtils.readFully(stream, b2); - int length = LittleEndian.getUShort(b2); + int length = EndianUtils.getUShortLE(b2); if(length > 1 && length <= MAX_SANE_TEXT_LENGTH) { // Length sanity check passed handleText(length, stream, xhtml);