Author: nick
Date: Fri Jul 8 13:51:49 2011
New Revision: 1144314
URL: http://svn.apache.org/viewvc?rev=1144314&view=rev
Log:
TIKA-679 Update the CADKEY PRT parser to get the description, and tweak the
text encoding based on work by Troy
Added:
tika/trunk/tika-parsers/src/test/resources/test-documents/testCADKEY2.prt
(with props)
Modified:
tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/prt/PRTParser.java
tika/trunk/tika-parsers/src/test/java/org/apache/tika/parser/prt/PRTParserTest.java
Modified:
tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/prt/PRTParser.java
URL:
http://svn.apache.org/viewvc/tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/prt/PRTParser.java?rev=1144314&r1=1144313&r2=1144314&view=diff
==============================================================================
---
tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/prt/PRTParser.java
(original)
+++
tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/prt/PRTParser.java
Fri Jul 8 13:51:49 2011
@@ -18,6 +18,7 @@ package org.apache.tika.parser.prt;
import java.io.IOException;
import java.io.InputStream;
+import java.io.UnsupportedEncodingException;
import java.util.Collections;
import java.util.Set;
@@ -46,6 +47,10 @@ public class PRTParser extends AbstractP
return SUPPORTED_TYPES;
}
+ /**
+ * How long do we allow a text run to claim to be, before we
+ * decide we're confused and it's not really text after all?
+ */
private static final int MAX_SANE_TEXT_LENGTH = 0x0800;
/*
@@ -81,6 +86,13 @@ public class PRTParser extends AbstractP
}
metadata.set(Metadata.CONTENT_TYPE, PRT_MIME_TYPE);
+ // The description, if set, is the next up-to-500 bytes
+ byte[] desc = new byte[500];
+ IOUtils.readFully(stream, desc);
+ String description = extractText(desc, true);
+ if(description.length() > 0) {
+ metadata.set(Metadata.DESCRIPTION, description);
+ }
// Now look for text
while( (read = stream.read()) > -1) {
@@ -173,8 +185,7 @@ public class PRTParser extends AbstractP
return;
}
- // TODO Is this the right character set?
- String text = new String(str, 0, length-1, "UTF-8");
+ String text = extractText(str, false);
xhtml.startElement("p");
xhtml.characters(text);
@@ -182,6 +193,38 @@ public class PRTParser extends AbstractP
}
/**
+     * Does our best to turn the bytes into text.
+     *
+     * @param data the raw bytes of the text run; may be empty
+     * @param trim if true, the text ends at the first null byte, so any
+     *  null padding after it is ignored; if false, only the final byte
+     *  (taken to be the null terminator) is dropped
+     * @return the decoded text, with the known character issues fixed up
+     * @throws TikaException if the JVM does not provide the CP437 charset
+     */
+    private String extractText(byte[] data, boolean trim) throws TikaException {
+        // The text is always stored null terminated, but sometimes
+        // may have extra null padding too
+        int length;
+        if(trim) {
+            // If there is no null byte anywhere, there is no terminator
+            // to drop, so every byte is text
+            length = data.length;
+            for(int i=0; i<data.length; i++) {
+                if(data[i] == 0) {
+                    length = i;
+                    break;
+                }
+            }
+        } else {
+            // Drop the trailing terminator, taking care not to end up
+            // with a negative length if handed an empty array
+            length = Math.max(0, data.length - 1);
+        }
+
+        // We believe that the text is basically stored as CP437
+        // That said, there are a few characters slightly wrong for that...
+        String text;
+        try {
+            text = new String(data, 0, length, "cp437");
+        } catch(UnsupportedEncodingException e) {
+            // Keep the cause, so the underlying failure isn't lost
+            throw new TikaException("JVM Broken, core codepage CP437 missing!", e);
+        }
+
+        // Fix up the known character issues: swap the phi (U+03C6) that
+        // the CP437 decoding gives us for O-with-stroke (U+00D8)
+        text = text.replace("\u03C6","\u00D8");
+
+        // All done, as best as we can!
+        return text;
+    }
+
+ /**
* Provides a view on the previous 5 bytes
*/
private static class Last5 {
Modified:
tika/trunk/tika-parsers/src/test/java/org/apache/tika/parser/prt/PRTParserTest.java
URL:
http://svn.apache.org/viewvc/tika/trunk/tika-parsers/src/test/java/org/apache/tika/parser/prt/PRTParserTest.java?rev=1144314&r1=1144313&r2=1144314&view=diff
==============================================================================
---
tika/trunk/tika-parsers/src/test/java/org/apache/tika/parser/prt/PRTParserTest.java
(original)
+++
tika/trunk/tika-parsers/src/test/java/org/apache/tika/parser/prt/PRTParserTest.java
Fri Jul 8 13:51:49 2011
@@ -25,7 +25,10 @@ import org.apache.tika.sax.BodyContentHa
import org.xml.sax.ContentHandler;
public class PRTParserTest extends TestCase {
- public void testPRTParser() throws Exception {
+ /**
+ * Try with a simple file
+ */
+ public void testPRTParserBasics() throws Exception {
InputStream input = PRTParserTest.class.getResourceAsStream(
"/test-documents/testCADKEY.prt");
try {
@@ -35,10 +38,13 @@ public class PRTParserTest extends TestC
assertEquals("application/x-prt",
metadata.get(Metadata.CONTENT_TYPE));
+ // This file has a date
assertEquals("2011-06-20T16:54:00",
metadata.get(Metadata.DATE));
assertEquals("2011-06-20T16:54:00",
metadata.get(Metadata.CREATION_DATE));
+ // But no description
+ assertEquals(null, metadata.get(Metadata.DESCRIPTION));
String contents = handler.toString();
@@ -57,6 +63,56 @@ public class PRTParserTest extends TestC
input.close();
}
}
+
+ /**
+ * Now a more complex one
+ */
+ public void testPRTParserComplex() throws Exception {
+ InputStream input = PRTParserTest.class.getResourceAsStream(
+ "/test-documents/testCADKEY2.prt");
+ try {
+ Metadata metadata = new Metadata();
+ ContentHandler handler = new BodyContentHandler();
+ new PRTParser().parse(input, handler, metadata);
+
+ assertEquals("application/x-prt",
metadata.get(Metadata.CONTENT_TYPE));
+
+ // File has both a date and a description
+ assertEquals("1997-04-01T08:59:00",
+ metadata.get(Metadata.DATE));
+ assertEquals("1997-04-01T08:59:00",
+ metadata.get(Metadata.CREATION_DATE));
+ assertEquals("TIKA TEST PART DESCRIPTION INFORMATION\r\n",
+ metadata.get(Metadata.DESCRIPTION));
+
+ String contents = handler.toString();
+
+ assertContains("ITEM", contents);
+ assertContains("REQ.", contents);
+ assertContains("DESCRIPTION", contents);
+ assertContains("MAT'L", contents);
+ assertContains("TOLERANCES UNLESS", contents);
+ assertContains("FRACTIONS", contents);
+ assertContains("ANGLES", contents);
+ assertContains("Acme Corporation", contents);
+
+ assertContains("DATE", contents);
+ assertContains("CHANGE", contents);
+ assertContains("DRAWN BY", contents);
+ assertContains("SCALE", contents);
+ assertContains("TIKA TEST DRAWING", contents);
+ assertContains("TIKA LETTERS", contents);
+ assertContains("5.82", contents);
+ assertContains("112"+'\u00b0', contents); // Degrees
+ assertContains("TIKA TEST LETTER", contents);
+ assertContains("17.11", contents);
+ assertContains('\u00d8'+" 2.000", contents); // Diameter
+ assertContains("Diameter", contents);
+ assertContains("The Apache Tika toolkit", contents);
+ } finally {
+ input.close();
+ }
+ }
public void assertContains(String needle, String haystack) {
assertTrue(needle + " not found in:\n" + haystack,
haystack.contains(needle));
Added: tika/trunk/tika-parsers/src/test/resources/test-documents/testCADKEY2.prt
URL:
http://svn.apache.org/viewvc/tika/trunk/tika-parsers/src/test/resources/test-documents/testCADKEY2.prt?rev=1144314&view=auto
==============================================================================
Binary file - no diff available.
Propchange:
tika/trunk/tika-parsers/src/test/resources/test-documents/testCADKEY2.prt
------------------------------------------------------------------------------
svn:mime-type = application/octet-stream