Author: nick
Date: Fri Jul  8 13:51:49 2011
New Revision: 1144314

URL: http://svn.apache.org/viewvc?rev=1144314&view=rev
Log:
TIKA-679 Update the CADKEY PRT parser to get the description, and tweak the 
text encoding based on work by Troy

Added:
    tika/trunk/tika-parsers/src/test/resources/test-documents/testCADKEY2.prt   
(with props)
Modified:
    
tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/prt/PRTParser.java
    
tika/trunk/tika-parsers/src/test/java/org/apache/tika/parser/prt/PRTParserTest.java

Modified: 
tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/prt/PRTParser.java
URL: 
http://svn.apache.org/viewvc/tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/prt/PRTParser.java?rev=1144314&r1=1144313&r2=1144314&view=diff
==============================================================================
--- tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/prt/PRTParser.java (original)
+++ tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/prt/PRTParser.java Fri Jul  8 13:51:49 2011
@@ -18,6 +18,7 @@ package org.apache.tika.parser.prt;
 
 import java.io.IOException;
 import java.io.InputStream;
+import java.io.UnsupportedEncodingException;
 import java.util.Collections;
 import java.util.Set;
 
@@ -46,6 +47,10 @@ public class PRTParser extends AbstractP
        return SUPPORTED_TYPES;
     }
     
+    /**
+     * How long do we allow a text run to claim to be, before we
+     * decide we're confused and it's not really text after all?
+     */
     private static final int MAX_SANE_TEXT_LENGTH = 0x0800;
     
     /*
@@ -81,6 +86,13 @@ public class PRTParser extends AbstractP
        }
        metadata.set(Metadata.CONTENT_TYPE, PRT_MIME_TYPE);
        
+       // The description, if set, is the next up-to-500 bytes
+       byte[] desc = new byte[500];
+       IOUtils.readFully(stream, desc);
+       String description = extractText(desc, true);
+       if(description.length() > 0) {
+          metadata.set(Metadata.DESCRIPTION, description);
+       }
        
        // Now look for text
        while( (read = stream.read()) > -1) {
@@ -173,8 +185,7 @@ public class PRTParser extends AbstractP
           return;
        }
        
-       // TODO Is this the right character set?
-       String text = new String(str, 0, length-1, "UTF-8");
+       String text = extractText(str, false);
        
        xhtml.startElement("p");
        xhtml.characters(text);
@@ -182,6 +193,38 @@ public class PRTParser extends AbstractP
     }
     
     /**
+     * Does our best to turn the bytes into text
+     */
+    private String extractText(byte[] data, boolean trim) throws TikaException {
+       // The text is always stored null terminated, but sometimes
+       //  may have extra null padding too
+       int length = data.length - 1;
+       if(trim) {
+          for(int i=0; i<data.length; i++) {
+             if(data[i] == 0) {
+                length = i;
+                break;
+             }
+          }
+       }
+       
+       // We believe that the text is basically stored as CP437
+       // That said, there are a few characters slightly wrong for that...
+       String text;
+       try {
+          text = new String(data, 0, length, "cp437");
+       } catch(UnsupportedEncodingException e) {
+          throw new TikaException("JVM Broken, core codepage CP437 missing!");
+       }
+       
+       // Fix up the known character issues
+       text = text.replace("\u03C6","\u00D8");
+
+       // All done, as best as we can!
+       return text;
+    }
+    
+    /**
      * Provides a view on the previous 5 bytes
      */
     private static class Last5 {

Modified: 
tika/trunk/tika-parsers/src/test/java/org/apache/tika/parser/prt/PRTParserTest.java
URL: 
http://svn.apache.org/viewvc/tika/trunk/tika-parsers/src/test/java/org/apache/tika/parser/prt/PRTParserTest.java?rev=1144314&r1=1144313&r2=1144314&view=diff
==============================================================================
--- tika/trunk/tika-parsers/src/test/java/org/apache/tika/parser/prt/PRTParserTest.java (original)
+++ tika/trunk/tika-parsers/src/test/java/org/apache/tika/parser/prt/PRTParserTest.java Fri Jul  8 13:51:49 2011
@@ -25,7 +25,10 @@ import org.apache.tika.sax.BodyContentHa
 import org.xml.sax.ContentHandler;
 
 public class PRTParserTest extends TestCase {
-    public void testPRTParser() throws Exception {
+    /**
+     * Try with a simple file
+     */
+    public void testPRTParserBasics() throws Exception {
        InputStream input = PRTParserTest.class.getResourceAsStream(
              "/test-documents/testCADKEY.prt");
        try  {
@@ -35,10 +38,13 @@ public class PRTParserTest extends TestC
 
          assertEquals("application/x-prt", metadata.get(Metadata.CONTENT_TYPE));
 
+          // This file has a date
           assertEquals("2011-06-20T16:54:00",
                 metadata.get(Metadata.DATE));
           assertEquals("2011-06-20T16:54:00",
                 metadata.get(Metadata.CREATION_DATE));
+          // But no description
+          assertEquals(null, metadata.get(Metadata.DESCRIPTION));
 
           String contents = handler.toString();
           
@@ -57,6 +63,56 @@ public class PRTParserTest extends TestC
           input.close();
        }
     }
+
+    /**
+     * Now a more complex one
+     */
+    public void testPRTParserComplex() throws Exception {
+       InputStream input = PRTParserTest.class.getResourceAsStream(
+             "/test-documents/testCADKEY2.prt");
+       try  {
+          Metadata metadata = new Metadata();
+          ContentHandler handler = new BodyContentHandler();
+          new PRTParser().parse(input, handler, metadata);
+
+          assertEquals("application/x-prt", metadata.get(Metadata.CONTENT_TYPE));
+
+          // File has both a date and a description
+          assertEquals("1997-04-01T08:59:00",
+                metadata.get(Metadata.DATE));
+          assertEquals("1997-04-01T08:59:00",
+                metadata.get(Metadata.CREATION_DATE));
+          assertEquals("TIKA TEST PART DESCRIPTION INFORMATION\r\n",
+                metadata.get(Metadata.DESCRIPTION));
+
+          String contents = handler.toString();
+          
+          assertContains("ITEM", contents);
+          assertContains("REQ.", contents);
+          assertContains("DESCRIPTION", contents);
+          assertContains("MAT'L", contents);
+          assertContains("TOLERANCES UNLESS", contents);
+          assertContains("FRACTIONS", contents);
+          assertContains("ANGLES", contents);
+          assertContains("Acme Corporation", contents);
+
+          assertContains("DATE", contents);
+          assertContains("CHANGE", contents);
+          assertContains("DRAWN BY", contents);
+          assertContains("SCALE", contents);
+          assertContains("TIKA TEST DRAWING", contents);
+          assertContains("TIKA LETTERS", contents);
+          assertContains("5.82", contents);
+          assertContains("112"+'\u00b0', contents); // Degrees
+          assertContains("TIKA TEST LETTER", contents);
+          assertContains("17.11", contents);
+          assertContains('\u00d8'+" 2.000", contents); // Diameter
+          assertContains("Diameter", contents);
+          assertContains("The Apache Tika toolkit", contents);
+       } finally {
+          input.close();
+       }
+    }
     
     public void assertContains(String needle, String haystack) {
       assertTrue(needle + " not found in:\n" + haystack, haystack.contains(needle));

Added: tika/trunk/tika-parsers/src/test/resources/test-documents/testCADKEY2.prt
URL: 
http://svn.apache.org/viewvc/tika/trunk/tika-parsers/src/test/resources/test-documents/testCADKEY2.prt?rev=1144314&view=auto
==============================================================================
Binary file - no diff available.

Propchange: 
tika/trunk/tika-parsers/src/test/resources/test-documents/testCADKEY2.prt
------------------------------------------------------------------------------
    svn:mime-type = application/octet-stream


Reply via email to