Author: nick Date: Fri Jul 8 13:51:49 2011 New Revision: 1144314 URL: http://svn.apache.org/viewvc?rev=1144314&view=rev Log: TIKA-679 Update the CADKEY PRT parser to get the description, and tweak the text encoding based on work by Troy
Added: tika/trunk/tika-parsers/src/test/resources/test-documents/testCADKEY2.prt (with props) Modified: tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/prt/PRTParser.java tika/trunk/tika-parsers/src/test/java/org/apache/tika/parser/prt/PRTParserTest.java Modified: tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/prt/PRTParser.java URL: http://svn.apache.org/viewvc/tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/prt/PRTParser.java?rev=1144314&r1=1144313&r2=1144314&view=diff ============================================================================== --- tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/prt/PRTParser.java (original) +++ tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/prt/PRTParser.java Fri Jul 8 13:51:49 2011 @@ -18,6 +18,7 @@ package org.apache.tika.parser.prt; import java.io.IOException; import java.io.InputStream; +import java.io.UnsupportedEncodingException; import java.util.Collections; import java.util.Set; @@ -46,6 +47,10 @@ public class PRTParser extends AbstractP return SUPPORTED_TYPES; } + /** + * How long do we allow a text run to claim to be, before we + * decide we're confused and it's not really text after all? + */ private static final int MAX_SANE_TEXT_LENGTH = 0x0800; /* @@ -81,6 +86,13 @@ public class PRTParser extends AbstractP } metadata.set(Metadata.CONTENT_TYPE, PRT_MIME_TYPE); + // The description, if set, is the next up-to-500 bytes + byte[] desc = new byte[500]; + IOUtils.readFully(stream, desc); + String description = extractText(desc, true); + if(description.length() > 0) { + metadata.set(Metadata.DESCRIPTION, description); + } // Now look for text while( (read = stream.read()) > -1) { @@ -173,8 +185,7 @@ public class PRTParser extends AbstractP return; } - // TODO Is this the right character set? - String text = new String(str, 0, length-1, "UTF-8"); + String text = extractText(str, false); xhtml.startElement("p"); xhtml.characters(text); @@ -182,6 +193,38 @@ public class PRTParser extends AbstractP } /** + * Does our best to turn the bytes into text + */ + private String extractText(byte[] data, boolean trim) throws TikaException { + // The text is always stored null terminated, but sometimes + // may have extra null padding too + int length = data.length - 1; + if(trim) { + for(int i=0; i<data.length; i++) { + if(data[i] == 0) { + length = i; + break; + } + } + } + + // We believe that the text is basically stored as CP437 + // That said, there are a few characters slightly wrong for that... + String text; + try { + text = new String(data, 0, length, "cp437"); + } catch(UnsupportedEncodingException e) { + throw new TikaException("JVM Broken, core codepage CP437 missing!"); + } + + // Fix up the known character issues + text = text.replace("\u03C6","\u00D8"); + + // All done, as best as we can! + return text; + } + + /** * Provides a view on the previous 5 bytes */ private static class Last5 { Modified: tika/trunk/tika-parsers/src/test/java/org/apache/tika/parser/prt/PRTParserTest.java URL: http://svn.apache.org/viewvc/tika/trunk/tika-parsers/src/test/java/org/apache/tika/parser/prt/PRTParserTest.java?rev=1144314&r1=1144313&r2=1144314&view=diff ============================================================================== --- tika/trunk/tika-parsers/src/test/java/org/apache/tika/parser/prt/PRTParserTest.java (original) +++ tika/trunk/tika-parsers/src/test/java/org/apache/tika/parser/prt/PRTParserTest.java Fri Jul 8 13:51:49 2011 @@ -25,7 +25,10 @@ import org.apache.tika.sax.BodyContentHa import org.xml.sax.ContentHandler; public class PRTParserTest extends TestCase { - public void testPRTParser() throws Exception { + /** + * Try with a simple file + */ + public void testPRTParserBasics() throws Exception { InputStream input = PRTParserTest.class.getResourceAsStream( "/test-documents/testCADKEY.prt"); try { @@ -35,10 +38,13 @@ public class PRTParserTest extends TestC assertEquals("application/x-prt", metadata.get(Metadata.CONTENT_TYPE)); + // This file has a date assertEquals("2011-06-20T16:54:00", metadata.get(Metadata.DATE)); assertEquals("2011-06-20T16:54:00", metadata.get(Metadata.CREATION_DATE)); + // But no description + assertEquals(null, metadata.get(Metadata.DESCRIPTION)); String contents = handler.toString(); @@ -57,6 +63,56 @@ public class PRTParserTest extends TestC input.close(); } } + + /** + * Now a more complex one + */ + public void testPRTParserComplex() throws Exception { + InputStream input = PRTParserTest.class.getResourceAsStream( + "/test-documents/testCADKEY2.prt"); + try { + Metadata metadata = new Metadata(); + ContentHandler handler = new BodyContentHandler(); + new PRTParser().parse(input, handler, metadata); + + assertEquals("application/x-prt", metadata.get(Metadata.CONTENT_TYPE)); + + // File has both a date and a description + assertEquals("1997-04-01T08:59:00", + metadata.get(Metadata.DATE)); + assertEquals("1997-04-01T08:59:00", + metadata.get(Metadata.CREATION_DATE)); + assertEquals("TIKA TEST PART DESCRIPTION INFORMATION\r\n", + metadata.get(Metadata.DESCRIPTION)); + + String contents = handler.toString(); + + assertContains("ITEM", contents); + assertContains("REQ.", contents); + assertContains("DESCRIPTION", contents); + assertContains("MAT'L", contents); + assertContains("TOLERANCES UNLESS", contents); + assertContains("FRACTIONS", contents); + assertContains("ANGLES", contents); + assertContains("Acme Corporation", contents); + + assertContains("DATE", contents); + assertContains("CHANGE", contents); + assertContains("DRAWN BY", contents); + assertContains("SCALE", contents); + assertContains("TIKA TEST DRAWING", contents); + assertContains("TIKA LETTERS", contents); + assertContains("5.82", contents); + assertContains("112"+'\u00b0', contents); // Degrees + assertContains("TIKA TEST LETTER", contents); + assertContains("17.11", contents); + assertContains('\u00d8'+" 2.000", contents); // Diameter + assertContains("Diameter", contents); + assertContains("The Apache Tika toolkit", contents); + } finally { + input.close(); + } + } public void assertContains(String needle, String haystack) { assertTrue(needle + " not found in:\n" + haystack, haystack.contains(needle)); Added: tika/trunk/tika-parsers/src/test/resources/test-documents/testCADKEY2.prt URL: http://svn.apache.org/viewvc/tika/trunk/tika-parsers/src/test/resources/test-documents/testCADKEY2.prt?rev=1144314&view=auto ============================================================================== Binary file - no diff available. Propchange: tika/trunk/tika-parsers/src/test/resources/test-documents/testCADKEY2.prt ------------------------------------------------------------------------------ svn:mime-type = application/octet-stream